From d99d235123d43825f35cdc4c8cb49339d9651056 Mon Sep 17 00:00:00 2001
From: zmc <zmc_li@foxmail.com>
Date: Fri, 22 Dec 2023 11:56:13 +0800
Subject: [PATCH] 1.修改了创建连接数据库引擎时的方言类型 2.修改了爬取数据的逻辑以及相关的异常分析代码

---
 src/Crawling.py |   57 +++++++++++++++++++++------------------------------------
 1 file changed, 21 insertions(+), 36 deletions(-)

diff --git a/src/Crawling.py b/src/Crawling.py
index 0610702..fa332bd 100644
--- a/src/Crawling.py
+++ b/src/Crawling.py
@@ -120,7 +120,8 @@
         count_all=count_all+get_OnePage(i,count)
         if len(list_temp)==0:        #濡傛灉璇ラ〉涓虹┖锛屽垯琛ㄧず璇ラ〉鍚庨潰閮芥棤鏁版嵁  閫�鍑哄惊鐜�
             print('鍚庨潰椤垫暟涓虹┖锛岀埇鍘讳笅涓�涓簵閾�')
-            break                    #閫�鍑哄惊鐜�
+            break 
+                           #閫�鍑哄惊鐜�
         list_all.extend(list_temp)   #灏嗕竴椤垫暟鎹垪琛ㄨ拷鍔犲埌list_all涓�
         print("鐖彇浜嗙",page,"椤�")
         page=page+1
@@ -213,7 +214,7 @@
     data = pd.DataFrame(list,columns=['dev_id','exception','exception_type','region','begin_time','end_time'])
     print("\n\n")
     print(data)
-    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
+    # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
     # con = engine.connect()
 
     # test3 瑕佸啓鍏ョ殑鏁版嵁琛紝杩欐牱鍐欑殑璇濊鎻愬墠鍦ㄦ暟鎹簱寤哄ソ琛�
@@ -380,7 +381,7 @@
     data = pd.DataFrame(list,columns=['MV_Stat_Code','MV_Create_Time','MV_Data_Time','MV_Fan_Electricity','MV_Purifier_Electricity','MV_Fume_Concentration','MV_Fume_Concentration2'])
     print("\n\n")
     print(data)
-    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
+    #engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
     #con = engine.connect()
 
     # test3 瑕佸啓鍏ョ殑鏁版嵁琛紝杩欐牱鍐欑殑璇濊鎻愬墠鍦ㄦ暟鎹簱寤哄ソ琛�
@@ -512,7 +513,7 @@
 def dev_info_data_if_exisitd(list,con):  #list涓虹埇鍙栨煇瀹跺簵閾烘寚瀹氶〉鏁拌浆鎹㈠悗鐨勬暟鎹�
     global con_read
     #鍒涘缓绗簩涓暟鎹簱杩炴帴
-    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
+    # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
     # con_read = engine.connect()
 
     df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier FROM ea_t_device_info',con=con_read)   #浠庤澶囦俊鎭〃涓鍙栬澶囩紪鍙凤紝搴楅摵鍚嶏紝渚涘簲鍟嗗瓧娈电殑鏁版嵁銆傝繑鍥炲�兼槸DateFrame绫诲瀷
@@ -604,11 +605,7 @@
     global ck
     global list_temp    #浣跨敤鍏ㄥ眬鍙橀噺
     list_temp.clear()  #娓呯┖涓存椂琛�
-    # session.headers = {
-    #     # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇
-    #     # "Cookie":ck,
-    #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
-    # }
+
     r = session.get(url, verify=False).text
     soup = bs(r,'html.parser')
 
@@ -686,11 +683,7 @@
     global list_temp    #浣跨敤鍏ㄥ眬鍙橀噺
 
     list_temp.clear()  #娓呯┖涓存椂琛�
-    # session.headers = {
-    #     # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇
-    #     # "Cookie":ck,
-    #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
-    # }
+
     r = session.get(url,  verify=False).text
     soup = bs(r,'html.parser')
 
@@ -707,7 +700,6 @@
         print('鍒犻櫎鐗规畩鐨勫悗涓や釜')
         print(list1)
         list.append(list1) 
-        #list1.clear()       
 
     #print(list) 
     list_data=[]  
@@ -753,36 +745,27 @@
     urls=url_more()   #杩斿洖鏂囦欢涓墍鏈夊簵閾虹殑url,甯︽渶澶ф樉绀烘潯鏁�100 銆倁rls鏄垪琛�
     #print(urls)
     teshu_url=[]
-    #'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6'  椋熷叾瀹�
     special_url=['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2','shop=%25E7%25BC%2598%25E5%25AE%25B6','shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2','shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1','shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B']
 
     for url in urls:          #閬嶅巻鎵�鏈夊簵閾虹殑url
         begin=url.find('&')+1
         end=url.rfind('&')
-        #print(begin,end)
         #鎵惧埌鐗规畩鐨剈rl,杩涜鐗规畩澶勭悊
         if url[begin:end] in special_url:   
             print('鍙戠幇鐗规畩鐨勶紒')
-            already_spider_shopnum += 1   #鐖幓鐨勫簵閾烘暟閲忓姞1
+            already_spider_shopnum += 1   # 鐖幓鐨勫簵閾烘暟閲忓姞1
             teshu_url.append(url)
             #鍐嶅垹闄や竴涓垪琛ㄥ厓绱�
-            url_teshu=url_add_time(url,date_begin,date_end)  #缁欐墍鏈塽rl鍔犱笂鏃ユ湡
-            list_to_MySql=get_MorePages_teshu(url_teshu,page)    #搴楅摵鐨剈rl,鐖彇鐨勯〉鏁�
-            # a=remove_Duplicates_list(list_to_MySql)
-            # print('\n')
-            # for item in a:
-            #     print(item)
+            url_teshu=url_add_time(url,date_begin,date_end)  # 缁欐墍鏈塽rl鍔犱笂鏃ユ湡
+            list_to_MySql=get_MorePages_teshu(url_teshu,page)    # 搴楅摵鐨剈rl,鐖彇鐨勯〉鏁�
+
             if len(list_to_MySql) == 0 :
                 print('璇ュ搴楅摵鏃堕棿娈垫棤鏁版嵁锛屽凡璺宠繃')
                 continue
             has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁
             for item in has_remove_duplicates:
                 all_data.append(item)
-            # is_minutes_exceed_30(has_remove_duplicates,con)      # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓�
-            # isExceeding(has_remove_duplicates,con)               # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑
-            # ea_t_dev(has_remove_duplicates,con)                  # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑
-            # fd_t_minbute(has_remove_duplicates,con)              #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 
-
+         
             list_to_MySql.clear()
         if url[begin:end]=='shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6':   #椋熷叾瀹�
             print('鍙戠幇鐗规畩鐨勶紒')
@@ -800,11 +783,7 @@
             has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁
             for item in has_remove_duplicates:
                 all_data.append(item)
-            # is_minutes_exceed_30(has_remove_duplicates,con)      # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓�
-            # isExceeding(has_remove_duplicates,con)               # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑
-            # ea_t_dev(has_remove_duplicates,con)                  # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑
-            # fd_t_minbute(has_remove_duplicates,con)              #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 
-      
+           
             list_to_MySql.clear()
     for t in teshu_url:     #浠巙rls琛ㄤ腑鍒犻櫎鐗规畩鐨�
         urls.remove(t)
@@ -874,7 +853,12 @@
 Key_period_night_begin = datetime.strptime('17:00',"%H:%M")   #鏅氫笂閲嶇偣鏃舵
 Key_period_night_end = datetime.strptime('21:00',"%H:%M")
 
-engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
+# e1ngine = create_engine("mysql+pymysql://fume:fume_feiyu2023@localhost:3306/fume?charset=utf8")
+engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
+
+
+# e1ngine = create_engine("mysql+pymysql:/fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
+
 # 涓撻棬璇诲彇璁惧淇℃伅琛�
 con_read = engine.connect()
 
@@ -903,9 +887,10 @@
     # 鍒濆鍖杝ession
     session = ses
     webshops = shops[:]
+    print(f'鑾峰彇鏁版嵁鐨勬椂闂村尯闂翠负锛歿beginTime}-{endTime}')
     print('浼犲叆鐨勫簵閾哄悕绉颁负锛�',shops)
    
-    engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
+    engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
     con = engine.connect()
 
     # back_cookie()   # 浠庢枃浠朵腑璇诲彇cookie

--
Gitblit v1.9.3