From 9fdbf60165db0400c2e8e6be2dc6e88138ac719a Mon Sep 17 00:00:00 2001 From: zmc <zmc_li@foxmail.com> Date: 星期五, 22 十二月 2023 13:28:11 +0800 Subject: [PATCH] Merge branch 'master' of ssh://114.215.109.124:29418/fume-manage-python --- src/Crawling.py | 41 ++++++++++------------------------------- 1 files changed, 10 insertions(+), 31 deletions(-) diff --git a/src/Crawling.py b/src/Crawling.py index 448d8db..fa332bd 100644 --- a/src/Crawling.py +++ b/src/Crawling.py @@ -120,7 +120,8 @@ count_all=count_all+get_OnePage(i,count) if len(list_temp)==0: #濡傛灉璇ラ〉涓虹┖锛屽垯琛ㄧず璇ラ〉鍚庨潰閮芥棤鏁版嵁 閫�鍑哄惊鐜� print('鍚庨潰椤垫暟涓虹┖锛岀埇鍘讳笅涓�涓簵閾�') - break #閫�鍑哄惊鐜� + break + #閫�鍑哄惊鐜� list_all.extend(list_temp) #灏嗕竴椤垫暟鎹垪琛ㄨ拷鍔犲埌list_all涓� print("鐖彇浜嗙",page,"椤�") page=page+1 @@ -604,11 +605,7 @@ global ck global list_temp #浣跨敤鍏ㄥ眬鍙橀噺 list_temp.clear() #娓呯┖涓存椂琛� - # session.headers = { - # # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇 - # # "Cookie":ck, - # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - # } + r = session.get(url, verify=False).text soup = bs(r,'html.parser') @@ -686,11 +683,7 @@ global list_temp #浣跨敤鍏ㄥ眬鍙橀噺 list_temp.clear() #娓呯┖涓存椂琛� - # session.headers = { - # # 姝ゅ娉ㄦ剰cookie锛岃鑷繁鎶撳彇 - # # "Cookie":ck, - # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", - # } + r = session.get(url, verify=False).text soup = bs(r,'html.parser') @@ -707,7 +700,6 @@ print('鍒犻櫎鐗规畩鐨勫悗涓や釜') print(list1) list.append(list1) - #list1.clear() #print(list) list_data=[] @@ -753,36 +745,27 @@ urls=url_more() #杩斿洖鏂囦欢涓墍鏈夊簵閾虹殑url,甯︽渶澶ф樉绀烘潯鏁�100 銆倁rls鏄垪琛� #print(urls) teshu_url=[] - #'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' 椋熷叾瀹� special_url=['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2','shop=%25E7%25BC%2598%25E5%25AE%25B6','shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2','shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1','shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B'] for url in urls: #閬嶅巻鎵�鏈夊簵閾虹殑url begin=url.find('&')+1 end=url.rfind('&') - #print(begin,end) #鎵惧埌鐗规畩鐨剈rl,杩涜鐗规畩澶勭悊 if url[begin:end] in special_url: print('鍙戠幇鐗规畩鐨勶紒') - already_spider_shopnum += 1 #鐖幓鐨勫簵閾烘暟閲忓姞1 + already_spider_shopnum += 1 # 鐖幓鐨勫簵閾烘暟閲忓姞1 teshu_url.append(url) #鍐嶅垹闄や竴涓垪琛ㄥ厓绱� - url_teshu=url_add_time(url,date_begin,date_end) #缁欐墍鏈塽rl鍔犱笂鏃ユ湡 - list_to_MySql=get_MorePages_teshu(url_teshu,page) #搴楅摵鐨剈rl,鐖彇鐨勯〉鏁� - # a=remove_Duplicates_list(list_to_MySql) - # print('\n') - # for item in a: - # print(item) + url_teshu=url_add_time(url,date_begin,date_end) # 缁欐墍鏈塽rl鍔犱笂鏃ユ湡 + list_to_MySql=get_MorePages_teshu(url_teshu,page) # 搴楅摵鐨剈rl,鐖彇鐨勯〉鏁� + if len(list_to_MySql) == 0 : print('璇ュ搴楅摵鏃堕棿娈垫棤鏁版嵁锛屽凡璺宠繃') continue has_remove_duplicates = remove_Duplicates_list(list_to_MySql) #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁 for item in has_remove_duplicates: all_data.append(item) - # is_minutes_exceed_30(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓� - # isExceeding(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑 - # ea_t_dev(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑 - # fd_t_minbute(has_remove_duplicates,con) #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 - + list_to_MySql.clear() if url[begin:end]=='shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6': #椋熷叾瀹� print('鍙戠幇鐗规畩鐨勶紒') @@ -800,11 +783,7 @@ has_remove_duplicates = remove_Duplicates_list(list_to_MySql) #鍘婚櫎鏌愪釜搴楅摵鎸囧畾椤垫暟鎵�鏈夐噸澶嶇殑鏁版嵁 for item in has_remove_duplicates: all_data.append(item) - # is_minutes_exceed_30(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁扮殑璁惧鏁呴殰鏁版嵁鍐欏叆鏁版嵁搴撳紓甯歌〃涓� - # isExceeding(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱瓒呮爣琛ㄤ腑 - # ea_t_dev(has_remove_duplicates,con) # 灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱璁惧淇℃伅琛ㄤ腑 - # fd_t_minbute(has_remove_duplicates,con) #灏嗘寚瀹氶〉鏁版暟鎹啓鍏ユ暟鎹簱鍒嗛挓鏁版嵁琛ㄤ腑 - + list_to_MySql.clear() for t in teshu_url: #浠巙rls琛ㄤ腑鍒犻櫎鐗规畩鐨� urls.remove(t) -- Gitblit v1.9.3