From d99d235123d43825f35cdc4c8cb49339d9651056 Mon Sep 17 00:00:00 2001 From: zmc <zmc_li@foxmail.com> Date: 星期五, 22 十二月 2023 11:56:13 +0800 Subject: [PATCH] 1.修改了创建连接数据库引擎时的方言类型 2.修改了爬取数据的逻辑以及相关的异常分析代码 --- src/Crawling.py | 57 +++++++++++++++++++++------------------------------------ 1 files changed, 21 insertions(+), 36 deletions(-) diff --git a/src/Crawling.py b/src/Crawling.py index 0610702..fa332bd 100644 --- a/src/Crawling.py +++ b/src/Crawling.py @@ -120,7 +120,8 @@ count_all=count_all+get_OnePage(i,count) if len(list_temp)==0: #濡傛灉璇ラ〉涓虹┖锛屽垯琛ㄧず璇ラ〉鍚庨潰閮芥棤鏁版嵁 閫�鍑哄惊鐜� print('鍚庨潰椤垫暟涓虹┖锛岀埇鍘讳笅涓�涓簵閾�') - break #閫�鍑哄惊鐜� + break + #閫�鍑哄惊鐜� list_all.extend(list_temp) #灏嗕竴椤垫暟鎹垪琛ㄨ拷鍔犲埌list_all涓� print("鐖彇浜嗙",page,"椤�") page=page+1 @@ -213,7 +214,7 @@ data = pd.DataFrame(list,columns=['dev_id','exception','exception_type','region','begin_time','end_time']) print("\n\n") print(data) - # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8") + # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8") # con = engine.connect() # test3 瑕佸啓鍏ョ殑鏁版嵁琛紝杩欐牱鍐欑殑璇濊鎻愬墠鍦ㄦ暟鎹簱寤哄ソ琛� @@ -380,7 +381,7 @@ data = pd.DataFrame(list,columns=['MV_Stat_Code','MV_Create_Time','MV_Data_Time','MV_Fan_Electricity','MV_Purifier_Electricity','MV_Fume_Concentration','MV_Fume_Concentration2']) print("\n\n") print(data) - #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8") + #engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8") #con = engine.connect() # test3 瑕佸啓鍏ョ殑鏁版嵁琛紝杩欐牱鍐欑殑璇濊鎻愬墠鍦ㄦ暟鎹簱寤哄ソ琛� @@ -512,7 +513,7 @@ def dev_info_data_if_exisitd(list,con): #list涓虹埇鍙栨煇瀹跺簵閾烘寚瀹氶〉鏁拌浆鎹㈠悗鐨勬暟鎹� global con_read #鍒涘缓绗簩涓暟鎹簱杩炴帴 - # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8") + # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8") # con_read = engine.connect() df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier 
FROM ea_t_device_info',con=con_read) #浠庤澶囦俊鎭〃涓鍙栬澶囩紪鍙凤紝搴楅摵鍚嶏紝渚涘簲鍟嗗瓧娈电殑鏁版嵁銆傝繑鍥炲�兼槸DateFrame绫诲瀷 @@ -604,11 +605,7 @@ global ck global list_temp #浣跨敤鍏ㄥ眬鍙橀噺 list_temp.clear() #娓呯┖涓存椂琛� - # session.headers = { - # # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇 - # # "Cookie":ck, - # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - # } + r = session.get(url, verify=False).text soup = bs(r,'html.parser') @@ -686,11 +683,7 @@ global list_temp #浣跨敤鍏ㄥ眬鍙橀噺 list_temp.clear() #娓呯┖涓存椂琛� - # session.headers = { - # # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇 - # # "Cookie":ck, - # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", - # } + r = session.get(url, verify=False).text soup = bs(r,'html.parser') @@ -707,7 +700,6 @@ print('鍒犻櫎鐗规畩鐨勫悗涓や釜') print(list1) list.append(list1) - #list1.clear() #print(list) list_data=[] @@ -753,36 +745,27 @@ urls=url_more() #杩斿洖鏂囦欢涓墍鏈夊簵閾虹殑url,甯︽渶澶ф樉绀烘潯鏁�100 銆倁rls鏄垪琛� #print(urls) teshu_url=[] - #'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' 椋熷叾瀹� special_url=['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2','shop=%25E7%25BC%2598%25E5%25AE%25B6','shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2','shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1','shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B'] for url in urls: #閬嶅巻鎵�鏈夊簵閾虹殑url begin=url.find('&')+1 end=url.rfind('&') - #print(begin,end) #鎵惧埌鐗规畩鐨剈rl,杩涜鐗规畩澶勭悊 if url[begin:end] in special_url: print('鍙戠幇鐗规畩鐨勶紒') - already_spider_shopnum += 1 #鐖幓鐨勫簵閾烘暟閲忓姞1 + already_spider_shopnum += 1 # 鐖幓鐨勫簵閾烘暟閲忓姞1 teshu_url.append(url) #鍐嶅垹闄や竴涓垪琛ㄥ厓绱� - url_teshu=url_add_time(url,date_begin,date_end) #缁欐墍鏈塽rl鍔犱笂鏃ユ湡 - list_to_MySql=get_MorePages_teshu(url_teshu,page) #搴楅摵鐨剈rl,鐖彇鐨勯〉鏁� - # a=remove_Duplicates_list(list_to_MySql) - # print('\n') - # for item in a: - # print(item) + url_teshu=url_add_time(url,date_begin,date_end) # 缁欐墍鏈塽rl鍔犱笂鏃ユ湡 + 
list_to_MySql=get_MorePages_teshu(url_teshu,page) # 搴楅摵鐨剈rl,鐖彇鐨勯〉鏁� + if len(list_to_MySql) == 0 : print('璇ュ搴楅摵鏃堕棿娈垫棤鏁版嵁锛屽凡璺宠繃') continue has_remove_duplicates = remove_Duplicates_list(list_to_MySql) #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁 for item in has_remove_duplicates: all_data.append(item) - # is_minutes_exceed_30(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓� - # isExceeding(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑 - # ea_t_dev(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑 - # fd_t_minbute(has_remove_duplicates,con) #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 - + list_to_MySql.clear() if url[begin:end]=='shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6': #椋熷叾瀹� print('鍙戠幇鐗规畩鐨勶紒') @@ -800,11 +783,7 @@ has_remove_duplicates = remove_Duplicates_list(list_to_MySql) #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁 for item in has_remove_duplicates: all_data.append(item) - # is_minutes_exceed_30(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓� - # isExceeding(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑 - # ea_t_dev(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑 - # fd_t_minbute(has_remove_duplicates,con) #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 - + list_to_MySql.clear() for t in teshu_url: #浠巙rls琛ㄤ腑鍒犻櫎鐗规畩鐨� urls.remove(t) @@ -874,7 +853,12 @@ Key_period_night_begin = datetime.strptime('17:00',"%H:%M") #鏅氫笂閲嶇偣鏃舵 Key_period_night_end = datetime.strptime('21:00',"%H:%M") -engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8") +# e1ngine = create_engine("mysql+pymysql://fume:fume_feiyu2023@localhost:3306/fume?charset=utf8") +engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8") + + +# e1ngine = create_engine("mysql+pymysql:/fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8") + # 涓撻棬璇诲彇璁惧淇℃伅琛� con_read = engine.connect() @@ -903,9 +887,10 @@ # 鍒濆鍖杝ession session = ses webshops = shops[:] + print(f'鑾峰彇鏁版嵁鐨勬椂闂村尯闂翠负锛歿beginTime}-{endTime}') print('浼犲叆鐨勫簵閾哄悕绉颁负锛�',shops) - engine = 
create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8") + engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8") con = engine.connect() # back_cookie() # 从文件中读取cookie -- Gitblit v1.9.3