From d03320352e3b34f2601aa8af9fd800a27c1adada Mon Sep 17 00:00:00 2001
From: zmc <zmc_li@foxmail.com>
Date: Fri, 22 Dec 2023 15:42:57 +0800
Subject: [PATCH] 1. Changed the dialect type used when creating the database connection engine 2. Revised the data-crawling logic and the related exception-analysis code

---
 /dev/null                                                              |  898 -----------------------------------------------------------
 src/__pycache__/auto_login.cpython-38.pyc                              |    0
 src/core_modules/__pycache__/remove_duplicates_methods.cpython-38.pyc  |    0
 src/__pycache__/Crawling_1.cpython-38.pyc                              |    0
 Vcode.jpg                                                              |    0
 src/__pycache__/write_to_MySql.cpython-38.pyc                          |    0
 src/__pycache__/write_to_minute_table.cpython-38.pyc                   |    0
 src/user/__pycache__/account_photo.cpython-38.pyc                      |    0
 8 files changed, 0 insertions(+), 898 deletions(-)

diff --git a/Vcode.jpg b/Vcode.jpg
index ed594ca..3a87104 100644
--- a/Vcode.jpg
+++ b/Vcode.jpg
Binary files differ
diff --git a/main.py b/main.py
deleted file mode 100644
index 93fc643..0000000
--- a/main.py
+++ /dev/null
@@ -1,898 +0,0 @@
-# Summary: multi-page crawl with successful DB insert; crawls every shop listed in the file; full web-page table; removes duplicate rows; an empty page means skip to the next shop; a shop with no data is also skipped
-# Crawl a specified number of pages for a shop (page size defaults to 100 records); e.g. for 12 pages, fetch all 12 and then write the result to the database in one batch
-# Crawl the whole page table and write it into 4 tables (the crawled data goes into two tables, plus the exceeding-limit table and the exception table)
-# The web page has 14 fields; 15 columns are stored in the database (sequence number + 14)
-import requests
-from bs4 import BeautifulSoup as bs
-from aip import AipOcr  # Baidu OCR
-import re  # regular expressions
-from pymysql import *  # MySQL database driver
-import pandas as pd
-from sqlalchemy import create_engine
-import urllib.parse  # double URL-encoding
-import time
-import uuid
-from datetime import datetime, timedelta
-
-import sys
-sys.path.append('D:\\z\workplace\\VsCode\\show')
-import core_modules.remove_duplicates_methods as rdm
-
-
-now_date = time.strftime("%Y-%m-%d", time.localtime())  # current date; default date for the URL date range
-now_date1 = time.strftime("%Y-%m", time.localtime())
-month_begin=now_date1+'-01'  # first day of the current month
-
-list_temp=[]  # temporary list, global variable
-
-
-
-def remove_Duplicates_list(list):  # de-duplicate a list against itself
-    global already_spider_datanum
-    list_store=[]
-    for item in list:
-        if item not in list_store:
-            list_store.append(item)
-        else:
-            print("Duplicate found")
-            already_spider_datanum=already_spider_datanum-1
-    #print(list_store)
-    return list_store
-
-def merge(list):  # merge the last six elements of the list
-    date_1=str(list.pop(-1))  # the popped element can still be used after removal
-    date_2=str(list.pop(-1))
-    date1=date_2+' '+date_1  # merge into "date time"
-
-    date_3=str(list.pop(-1))
-    date_4=str(list.pop(-1))
-    date2=date_4+' '+date_3
-
-    date_5=str(list.pop(-1))
-    date_6=str(list.pop(-1))
-    date3=date_6+' '+date_5
-    list.append(date3)  # append the merged values back to the end of the list
-    list.append(date2)
-    list.append(date1)
-
-
-    return list
-
-def list_url(url,page_num):  # i in the url is the page number; page_num is how many pages to crawl; append the page parameter to the url
-    urls = [url+'&page'+'={}'.format(str(i)) for i in range(1,page_num+1)]
-    return urls  # return the links for every page of this url, as a list
-
-
-def get_OnePage(url,count):  # fetch one page of data and put it into list_data; url is the page address to visit
-    global ck
-    global list_temp  # use the global variable
-
-    list_temp.clear()  # clear the temporary list
-    headers = {
-        # note: the cookie here must be captured yourself
-        "Cookie":ck,
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
-    }
-    r = requests.get(url=url, headers=headers, verify=False).text
-    soup = bs(r,'html.parser')
-
-    list=[]  # list to hold the result
-    tags = soup.find_all("tr")  # all table rows
-    for tag in tags:  # each tag is one row
-        count=count+1
-        element = tag.text  # all text inside the <tr> tag
-        element = element.strip()  # strip leading/trailing whitespace
-        list1 = element.split();  # split the string on whitespace into a list
-
-        del (list1[-2:])  # the last two elements are not needed; delete them
-        list1.insert(3,'')
-        list.append(list1)  # list keeps every row
-
-    #print(list)
-
-
-    list_data=[]  # one page of data with merged dates
-    for i in list:
-        list_data.append(merge(i))  # the trailing date fields are merged into "date time"; the records are now complete
-
-    del list_data[0]  # delete the text header row
-    count=count-1  # the header row was deleted, so the total row count drops by one
-    #list_removeD= remove_Duplicates_list(list_data)  # list_data holds one page of data
-
-    #print(list_data)
-    list_temp=list_data[:]
-    #list_temp=remove_Duplicates_list(list_data)[:]  # copy the de-duplicated page into the temporary list list_temp
-    return count
-
-
-
-def get_MorePages(url,page_num):  # crawl multiple pages for the given shop; page_num is the number of pages
-    global sleeptime
-    global already_spider_datanum
-    urls=list_url(url,page_num)  # the page urls to iterate over
-    count_all=0  # total number of rows
-    list_all=[]  # all crawled data
-    page=1
-    for i in urls:
-        count=0
-        count_all=count_all+get_OnePage(i,count)
-        if len(list_temp)==0:  # an empty page means all later pages are empty too; leave the loop
-            print('Remaining pages are empty; moving on to the next shop')
-            break  # leave the loop
-        list_all.extend(list_temp)  # append this page's list to list_all
-        print("Crawled page",page)
-        page=page+1
-        print("\n")
-        time.sleep(sleeptime)  # wait between requests
-
-    for j in list_all:
-        print(j)  # print every row in the list
-    print("Total rows:",count_all)
-    already_spider_datanum += count_all  # running total of crawled rows
-
-    return list_all
-    #return remove_Duplicates_list(list_all)  # filter duplicates from the list once more
-
-
-def url_more():  # build urls from the shop names in the file; returns a list of urls; page size defaults to the page maximum of 100
-    global shopnum
-    shopnames = []  # Chinese shop names
-    with open("D:\\z\\workplace\\shopname.txt",encoding='utf-8') as file:  # read the shop names in the file into a list
-        for line in file:
-            line = line.strip()  #or some other preprocessing
-            shopnames.append(line)  #storing everything in memory!
-    #print(type(shopnames[0]))
-    # encode
-    shopnum=len(shopnames)  # total number of shops in the file
-    shopname_encoding=[]  # encoded shop names
-    i=0
-    for name in shopnames:
-        shopname_encoding.append(urllib.parse.quote(urllib.parse.quote(shopnames[i])))  # double URL-encode the shop name
-        i=i+1
-    # build the usable urls
-    urls=[]  # the assembled urls
-    for shop in shopname_encoding:
-        url='http://xhhb.senzly.cn/sys/yyRealTimeValue_list.jsp?key1=&shop='+shop+'&pagesize=100'
-        urls.append(url)
-    # for i in urls:
-    #     print(i)
-    return urls  # urls corresponding to the shop names in the file
-
-# append the start and end dates to the url
-def url_add_time(url,date_begin=month_begin,date_end=now_date):  # url, Y-m-d e.g. 2023-05-03
-    url_date=url+'&key5='+date_begin+'&key6='+date_end
-    print(url_date)
-    return url_date
-
-#------------------------------------------------------------------------ write exceeding-limit fume data into the exception table
-# whether the two times differ by exactly 10 minutes; returns True if so, otherwise False
-def is_time_difference_equals_10_mins(datestr1, datestr2):
-    date1 = datetime.strptime(datestr1, "%Y-%m-%d %H:%M")
-    date2 = datetime.strptime(datestr2, "%Y-%m-%d %H:%M")
-    time_diff = date2 - date1
-
-    return time_diff == timedelta(minutes = 10) or time_diff == timedelta(minutes = -10)  # timedelta() represents the interval between two date, time, or datetime objects
-
-
-# one record every ten minutes is normal; find break points where the gap exceeds 10 minutes
-def find_break_point(list):  # list holds the exceeding-limit records
-    i=0
-    j=1
-    break_point = []  # the break points
-    for item in list[1:]:
-        if(is_time_difference_equals_10_mins(list[i][2],item[2]) == False):
-            break_point.append(j)
-        i=i+1
-        j=j+1
-    print('Break points:')
-    print(break_point)
-
-    # return the break points
-    return break_point
-
-
-
-# split the list into sublists at the break points, returned via result
-def point_write(list,b_point):  # list is the data; b_point holds the break points in ascending order
-    result = []
-    last_index = 0
-    for index in b_point:
-        result.append(list[last_index:index])  # slicing
-        last_index=index
-    result.append(list[last_index:])
-    return result
-
-
-# write device-failure info into the abnormal_data exception table
-def abnormal_write_to_SQL(list,con):
-    data = pd.DataFrame(list,columns=['dev_id','exception','exception_type','region','begin_time','end_time'])
-    print("\n\n")
-    print(data)
-    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    # con = engine.connect()
-
-    # target table; when writing this way the table must already exist in the database
-    data.to_sql(name="abnormal_data", con=con, if_exists="append",index=False,index_label=False)
-    # con.close()
-
-
-
-def exception(list,con):  # list holds the exceeding-limit records
-    break_point=find_break_point(list)  # returns the break points
-    split_list=point_write(list,break_point)  # split the original list into sublists at the break points; split_list is a three-level array, e.g. [[[1,2],[4,'g']],[[8,'2'],['4','g']],[[1,2],[4,'g']]]
-    # print('Sublists split by exceeding-limit time range:')
-    # for i in split_list:
-    #     print(i)
-    print('\n')
-    abnormal=[]  # reassembled exception-table rows
-
-    for item in split_list:  # pull the needed time info from each sublist and add the new fields
-        temp=[]
-        temp.append(item[0][0])  # device code
-        temp.append('数据异常')  # exception description
-        temp.append('0')  # type 0: fume concentration over the limit
-        temp.append('徐汇区')
-        temp.append(item[len(item)-1][2])  # attribution time of the earliest record: start time
-        temp.append(item[0][2])  # attribution time: end time
-        abnormal.append(temp)
-
-    print(abnormal)
-
-    print('Exceeding-limit exception time ranges:')
-    for j in abnormal:
-        print(j)
-    abnormal_write_to_SQL(abnormal,con)  # write into the exception table
-    print("Exceeding-limit fume data written to the exception table!")
-
-#------------------------------------------------------------------------ write device-failure data into the exception table
-# whether the two times differ by more than 30 minutes; returns True if so, otherwise False
-def is_time_difference_equals_30_mins(datestr1, datestr2):
-    date1 = datetime.strptime(datestr1, "%Y-%m-%d %H:%M")
-    date2 = datetime.strptime(datestr2, "%Y-%m-%d %H:%M")
-    time_diff = date2 - date1
-    return time_diff > timedelta(minutes=30)
-
-# find device-failure info and write it into the exception table
-def is_minutes_exceed_30(list,con) :  # list holds all records for the shop's requested pages; the times are in descending order
-    device_failure=[]  # device-failure records
-    startTime = list[0][11]
-    print('Start time:',startTime)
-    for item in list[1:] :
-        if is_time_difference_equals_30_mins(item[11],startTime) :  # must be strictly greater than 30 minutes, not equal
-            temp=[]
-            temp.append(item[2])  # device code
-            temp.append('设备故障')  # exception description
-            temp.append('1')  # device failure
-            temp.append('徐汇区')
-            temp.append(item[11])  # failure start time
-            startTimeSub= datetime.strptime(startTime,"%Y-%m-%d %H:%M") - timedelta(minutes = 10)  # result is datetime.datetime; convert back to a string
-            print('Result after subtraction:',str(startTimeSub))
-            print('Type after subtraction:',type(str(startTimeSub)))
-            temp.append(str(startTimeSub)[:16])  # failure end time
-            device_failure.append(temp)
-        startTime = item[11]
-    print('Device-failure records:')
-    for i in device_failure :
-        print(i)
-    not_Key_period_exceed_30_minutes(device_failure,con)  # write power-supply anomaly info into the exception table
-    #abnormal_write_to_SQL(device_failure,con)  # write device-failure info into the exception table
-    print('Power-supply anomaly / offline info written to the exception table!')
-#----------------------------------------------------------------------- write power-supply anomaly data into the exception table
-# returns True when the start and end times both fall outside the key periods
-def is_time_not_between_key_period(begin_time,end_time) :  # parameters are date strings like '2023-06-21 14:30'
-    global Key_period_noon_begin,Key_period_noon_end,Key_period_night_begin,Key_period_night_end
-    # # midday key period
-    # Key_period_noon_begin = datetime.strptime('10:00',"%H:%M")
-    # Key_period_noon_end = datetime.strptime('14:00',"%H:%M")
-
-    # # evening key period
-    # Key_period_night_begin = datetime.strptime('17:00',"%H:%M")
-    # Key_period_night_end = datetime.strptime('21:00',"%H:%M")
-
-    begin1 = datetime.strptime(begin_time[11:],"%H:%M")
-    end1 = datetime.strptime(end_time[11:],"%H:%M")
-
-    # when the start and end times are both outside the key periods, record this failure as: suspected power-supply anomaly
-    if ((( begin1 > Key_period_noon_begin and begin1 < Key_period_noon_end ) or ( begin1 > Key_period_night_begin and begin1 < Key_period_night_end )) or (( end1 > Key_period_noon_begin and end1 < Key_period_noon_end ) or ( end1 > Key_period_night_begin and end1 < Key_period_night_end ))) ==False :
-        print('Start or end time is outside the key periods')
-        return True
-    print('Within a key period')
-    return False
-
-# returns True when the start and end times are both within a key period
-def is_time_between_key_period(begin_time,end_time) :  # parameters are date strings like '2023-06-21 14:30'
-    global Key_period_noon_begin,Key_period_noon_end,Key_period_night_begin,Key_period_night_end
-    # # midday key period
-    # Key_period_noon_begin = datetime.strptime('10:00',"%H:%M")
-    # Key_period_noon_end = datetime.strptime('14:00',"%H:%M")
-
-    # # evening key period
-    # Key_period_night_begin = datetime.strptime('17:00',"%H:%M")
-    # Key_period_night_end = datetime.strptime('21:00',"%H:%M")
-
-    begin1 = datetime.strptime(begin_time[11:],"%H:%M")
-    end1 = datetime.strptime(end_time[11:],"%H:%M")
-
-    # when the start and end times are both within a key period, record this failure as: offline
-    if ((begin1 > Key_period_noon_begin and begin1 < Key_period_noon_end) and ( end1 > Key_period_noon_begin and end1 < Key_period_noon_end )) or ( (begin1 > Key_period_night_begin and begin1 < Key_period_night_end) and ( end1 > Key_period_night_begin and end1 < Key_period_night_end )) :
-        print('Start or end time is within a key period')
-        return True
-    print('Outside the key periods')
-    return False
-
-
-
-def not_Key_period_exceed_30_minutes(list,con) :  # list holds the device-failure time ranges
-    power_supply_abnormal = []  # power-supply anomaly / offline records
-    for item in list :
-        if is_time_not_between_key_period(item[4],item[5]) :  #else:
-            temp = []
-            temp.append(item[0])
-            temp.append('设备故障')
-            temp.append('1')  # suspected power-supply anomaly
-            temp.append('徐汇区')
-            temp.append(item[4])
-            temp.append(item[5])
-            power_supply_abnormal.append(temp)
-        elif is_time_between_key_period(item[4],item[5]) :
-            temp = []
-            temp.append(item[0])
-            temp.append('设备故障')
-            temp.append('2')  # offline
-            temp.append('徐汇区')
-            temp.append(item[4])
-            temp.append(item[5])
-            power_supply_abnormal.append(temp)
-    print('Power-supply anomaly records:')
-    for i in power_supply_abnormal :
-        print(i)
-
-    # write the power-supply anomaly info into the database exception table
-    abnormal_write_to_SQL(power_supply_abnormal,con)  # write device-failure info into the exception table
-    print('Power-supply anomaly info written to the exception table!')
-
-
-
-#------------------------------------------------------------------------ write into the exceeding-limit table
-
-# return the reassembled row
-def refind_ex(list):  # list is one record from the web page
-    temp=[]
-    temp.append(list[2])  # device code
-    temp.append(list[12])  # reporting time
-    temp.append(list[11])  # attribution time
-    temp.append(list[6])  # fan current (6)
-    temp.append(list[7])  # purifier current (7)
-    temp.append(list[4])  # inlet fume concentration
-    temp.append(list[5])  # outlet fume concentration
-
-    print(temp)
-    return temp
-
-
-# write the list into the exceeding_st_data table
-def ex_write_to_SQL(list,con):
-    data = pd.DataFrame(list,columns=['MV_Stat_Code','MV_Create_Time','MV_Data_Time','MV_Fan_Electricity','MV_Purifier_Electricity','MV_Fume_Concentration','MV_Fume_Concentration2'])
-    print("\n\n")
-    print(data)
-    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    #con = engine.connect()
-
-    # target table; when writing this way the table must already exist in the database
-    data.to_sql(name="exceeding_st_data", con=con, if_exists="append",index=False,index_label=False)
-    #con.close()
-    print("Exceeding-limit table written!")
-
-
-# list holds all records for the shop's requested pages; write the exceeding rows into the exceeding-limit table
-def isExceeding(list,con):  # list elements are themselves lists
-    exceedingData=[]  # the exceeding rows
-    for item in list:  # find and record the exceeding rows
-        if float(item[5]) > 1:  # an outlet fume concentration above 1 exceeds the limit
-            print("This record exceeds the limit")
-            # keep this record, extract the needed values, and add the extra fields
-            exceedingData.append(refind_ex(item))
-
-
-    for i in exceedingData:  # walk the list
-        print(i)
-
-    if(len(exceedingData) != 0) :  # only when there is exceeding data
-        # classify the exceeding data by time and write it into the abnormal_data exception table
-        exception(exceedingData,con)
-
-        # write the exceeding data directly into the exceeding-limit table
-        ex_write_to_SQL(exceedingData,con)
-    else:
-        print('No exceeding data for this shop')
-
-
-#------------------------------------------------------------------------ write data into the device-info table
-def generate_short_uuid():
-    arrayOf=[
-        "a","b","c","d","e","f","g","h","i","j","k","l","m",
-        "n","o","p","q","r","s","t","u","v","w","x","y","z",
-        "0","1","2","3","4","5","6","7","8","9",
-        "A","B","C","D","E","F","G","H","I","J","K","L","M",
-        "N","O","P","Q","R","S","T","U","V","W","X","Y","Z"
-    ]
-    list=[]
-    ui=str(uuid.uuid4()).replace('-', '')
-    for i in range(0,16):
-        a1=ui[i*2:i*2+2]
-        x=int(a1,16)
-        list.append(arrayOf[x % 0x3E])
-    return ''.join(list)
-
-
-# return the reassembled row
-def refind_ea(list):  # one record, i.e. one list
-    temp=[]
-    temp.append(generate_short_uuid())
-    temp.append(list[2])
-    temp.append(list[1])
-    temp.append(list[0])
-    temp.append(1)
-    print(temp)
-    return temp
-
-# write the list into the device-info table ea_t_dev
-def ea_write_to_SQL(list,con):
-    data = pd.DataFrame(list,columns=['DI_GUID','DI_Code','DI_Name','DI_Supplier','DI_Online'])
-    print("\n\n")
-    print('Writing to the table, DataFrame:',data)
-
-    # target table; when writing this way the table must already exist in the database
-    data.to_sql(name="ea_t_device_info", con=con, if_exists="append",index=False,index_label=False)
-    print("Device-info table written!")
-
-
-def dev_info_data_if_exisitd(list,con):  # list is the converted data crawled for one shop's requested pages
-    global con_read
-    # create a second database connection
-    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    # con_read = engine.connect()
-
-    df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier FROM ea_t_device_info',con=con_read)  # read the device code, shop name and supplier columns from the device-info table; returns a DataFrame
-    # con_read.close()  # close the connection
-
-    res = df.values.tolist()  # convert the DataFrame to a list of rows; res holds the device-info table data
-    print('******** device info ******')
-    for i in res:
-        print(i)
-    print('Device-info table record count:',len(res))
-
-    list1 = rdm.remove_duplicates_dev_info(list)  # rows with the same device code, shop name and supplier are duplicates and are removed; list1 is the de-duplicated result
-    if len(res) > 0 :  # the device table already has data
-        # compare
-        temp=list1[:]  # copy list1 into temp and iterate over temp, so we never iterate and delete from the same list
-        print('Removing duplicates:')
-        print(list1)
-        for item in temp:
-            if item[1:4] in ( x[:] for x in res ) :  # if the value to be stored already equals a row in the device table, remove it from the list
-                list1=rdm.remove_given_data_dev_info(list1,item[1:4])  # remove this item from list1
-
-        print('Device-info table not empty; de-duplicated list:',list1)
-        if( len(list1) != 0 ) :  # write only if something is left after removal
-            ea_write_to_SQL(list1,con)  # write the list into the ea_t_dev table
-    else :  # the device table is empty
-        # a=rdm.remove_duplicates_dev_info(list)  # duplicates (same device code, shop name, supplier) removed
-        print('Device table empty; device info to be written:',list1)
-        # write the de-duplicated data into the device-info table
-        ea_write_to_SQL(list1,con)  # write the list into the device table; rows with the same device code, shop name, supplier are duplicates and removed
-
-
-
-# convert the raw data into new rows and write them into the device-info table
-def ea_t_dev(list,con):  # records for one shop's requested pages; list elements are themselves lists, e.g. [[1,2,3,'a'],[52,3,'a'],[6,2,3,'a']]; con is the database connection
-    staging=[]  # the converted rows
-    for item in list:
-        # extract the needed values and add the extra fields
-        staging.append(refind_ea(item))  # convert
-    print('Device data after conversion:')
-    for i in staging:
-        print(i)
-
-    # check data already in the device table; skip rows whose device info already exists
-    dev_info_data_if_exisitd(staging,con)
-
-
-#---------------------------------- write into the minute-data table
-
-# return the reassembled row
-def refind_fd(list):  # one record, i.e. one list
-    temp=[]
-    temp.append(list[2])  # device code
-    temp.append(list[12])  # reporting time
-    temp.append(list[11])  # attribution time
-    temp.append(list[6])  # fan current (6)
-    temp.append(list[7])  # purifier current (7)
-    temp.append(list[4])  # inlet fume concentration
-    temp.append(list[5])  # outlet fume concentration
-
-    print(temp)
-    return temp
-
-
-# write the list into the minute-data table
-def fd_write_to_SQL(list,con):
-    data = pd.DataFrame(list,columns=['MV_Stat_Code','MV_Create_Time','MV_Data_Time','MV_Fan_Electricity','MV_Purifier_Electricity','MV_Fume_Concentration','MV_Fume_Concentration2'])
-    print("Writing to the minute-data table, DataFrame:")
-    print(data)
-
-    # target table; when writing this way the table must already exist in the database
-    data.to_sql(name="fd_t_minutevalue", con=con, if_exists="append",index=False,index_label=False)
-
-    print("Minute-data table written!")
-
-# convert, then write into the fd_t_minbute table
-def fd_t_minbute(list,con):  # one page of records; con is the database connection
-    staging=[]  # the converted rows
-    for item in list:
-        # extract the needed values and add the extra fields
-        staging.append(refind_fd(item))
-    print('Minute data after conversion:')
-    for i in staging:
-        print(i)
-    fd_write_to_SQL(staging,con)  # write the list into the minute-data table
-
-
-#------------------------------------------------------------------------ special handling for the 食其家 (Sukiya) shop
-def get_OnePage_teshu_shiqijia(url,count):
-    global ck
-    global list_temp  # use the global variable
-
-
-    list_temp.clear()  # clear the temporary list
-    headers = {
-        # note: the cookie here must be captured yourself
-        "Cookie":ck,
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
-    }
-    r = requests.get(url=url, headers=headers, verify=False).text
-    soup = bs(r,'html.parser')
-
-    list=[]  # list to hold the result
-
-    tags = soup.find_all("tr")  # all table rows
-    for tag in tags:  # each tag is one row
-        count=count+1
-        element = tag.text  # all text inside the <tr> tag
-        element = element.strip()  # strip leading/trailing whitespace
-        list1 = element.split();  # split the string on whitespace into a list
-
-        del (list1[-2:])  # the last two elements are not needed; delete them
-        print('Deleted the special last two elements')
-        print(list1)
-
-        str_temp1=list1[4]+list1[5]  # merge the 5th and 6th elements into one
-        print(str_temp1)
-        del list1[5]
-        list1[4]=str_temp1
-        print("Elements merged")
-        print(list1)
-
-        str_temp2=list1[1]+list1[2]  # merge the 2nd and 3rd elements
-        del list1[2]
-        list1[1]=str_temp2
-
-        list.append(list1)
-        print("Final row data")
-        print(list1)
-        #list1.clear()
-
-    #print(list)
-    list_data=[]
-    for i in list:  # the trailing date fields are merged into "date time"
-        list_data.append(merge(i))
-    del list_data[0]  # delete the text header row
-    count=count-1  # the header row was deleted, so the total row count drops by one
-    #print(list_data)
-    #list_temp=remove_Duplicates_list(list_data)[:]  # copy the de-duplicated page into the temporary list list_temp
-    list_temp=list_data[:]
-    return count
-
-
-def get_MorePages_teshu_shiqijia(url,page_num):
-    global sleeptime
-    global already_spider_datanum
-    urls=list_url(url,page_num)  # the page urls to iterate over
-    count_all=0  # total number of rows
-    list_all=[]  # all crawled data
-    page=1
-    for i in urls:
-        count=0
-        count_all=count_all+get_OnePage_teshu_shiqijia(i,count)
-        if len(list_temp)==0:  # an empty page means all later pages are empty too; leave the loop
-            print('Remaining pages are empty; moving on to the next shop')
-            break
-        list_all.extend(list_temp)  # append this page's list to list_all
-        print("Crawled page",page)
-        page=page+1
-        print("\n")
-        time.sleep(sleeptime)  # wait between requests
-
-    for j in list_all:
-        print(j)  # print every row in the list
-    print("Total rows:",count_all)
-    already_spider_datanum += count_all  # running total of crawled rows
-    return list_all
-
-
-
-#------------------------------------------------------------------------ special URLs
-def get_OnePage_teshu(url,count):  # fetch one page of data into list_data; url is the page address to visit
-    global ck
-    global list_temp  # use the global variable
-
-    list_temp.clear()  # clear the temporary list
-    headers = {
-        # note: the cookie here must be captured yourself
-        "Cookie":ck,
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
-    }
-    r = requests.get(url=url, headers=headers, verify=False).text
-    soup = bs(r,'html.parser')
-
-    list=[]  # list to hold the result
-
-    tags = soup.find_all("tr")  # all table rows
-    for tag in tags:  # each tag is one row
-        count=count+1
-        element = tag.text  # all text inside the <tr> tag
-        element = element.strip()  # strip leading/trailing whitespace
-        list1 = element.split();  # split the string on whitespace into a list
-
-        del (list1[-2:])  # the last two elements are not needed; delete them
-        print('Deleted the special last two elements')
-        print(list1)
-        list.append(list1)
-        #list1.clear()
-
-    #print(list)
-    list_data=[]
-    for i in list:
-        list_data.append(merge(i))  # merge the trailing date fields into "date time"
-    del list_data[0]  # delete the text header row
-    count=count-1  # the header row was deleted, so the total row count drops by one
-    #print(list_data)
-    #list_temp=remove_Duplicates_list(list_data)[:]  # copy the de-duplicated page into the temporary list list_temp
-    list_temp=list_data[:]
-    return count
-
-
-def get_MorePages_teshu(url,page_num):  # crawl multiple pages for the given shop; page_num is the number of pages
-    global sleeptime
-    global already_spider_datanum
-    urls=list_url(url,page_num)  # the page urls to iterate over, as a list
-    count_all=0  # total number of rows
-    list_all=[]  # all crawled data
-    page=1
-    for i in urls:
-        count=0
-        count_all=count_all+get_OnePage_teshu(i,count)
-        if len(list_temp)==0:  # an empty page means all later pages are empty too; leave the loop
-            print('Remaining pages are empty; moving on to the next shop')
-            break
-        list_all.extend(list_temp)  # append this page's list to list_all
-        print("Crawled page",page)
-        page=page+1
-        print("\n")
-        time.sleep(sleeptime)  # wait between requests
-
-    for j in list_all:
-        print(j)  # print every row in the list
-    print("Total rows:",count_all)
-    already_spider_datanum += count_all  # running total of crawled rows
-    return list_all
-
-
-def spilt_url_teshu(con,page,date_begin=month_begin,date_end=now_date):  # handle the special urls first, then filter them out
-    global already_spider_shopnum
-    urls=url_more()  # urls for every shop in the file, with the maximum page size of 100; urls is a list
-    #print(urls)
-    teshu_url=[]
-    #'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' 食其家 (Sukiya)
-    special_url=['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2','shop=%25E7%25BC%2598%25E5%25AE%25B6','shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2','shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1','shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B']
-
-    for url in urls:  # iterate over every shop url
-        begin=url.find('&')+1
-        end=url.rfind('&')
-        #print(begin,end)
-        # special urls get special handling
-        if url[begin:end] in special_url:
-            print('Found a special one!')
-            already_spider_shopnum += 1  # crawled-shop count + 1
-            teshu_url.append(url)
-            # this list element is removed from urls later
-            url_teshu=url_add_time(url,date_begin,date_end)  # add the date range to the url
-            list_to_MySql=get_MorePages_teshu(url_teshu,page)  # shop url, number of pages to crawl
-            # a=remove_Duplicates_list(list_to_MySql)
-            # print('\n')
-            # for item in a:
-            #     print(item)
-            if len(list_to_MySql) == 0 :
-                print('No data for this shop in the date range; skipped')
-                continue
-            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)  # remove all duplicate rows for this shop's requested pages
-            is_minutes_exceed_30(has_remove_duplicates,con)  # write the device-failure data into the database exception table
-            isExceeding(has_remove_duplicates,con)  # write the crawled data into the exceeding-limit table
-            ea_t_dev(has_remove_duplicates,con)  # write the crawled data into the device-info table
-            fd_t_minbute(has_remove_duplicates,con)  # write the crawled data into the minute-data table
-
-            list_to_MySql.clear()
-        if url[begin:end]=='shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6':  # 食其家 (Sukiya)
-            print('Found a special one!')
-            already_spider_shopnum += 1  # crawled-shop count + 1
-            teshu_url.append(url)
-            # this list element is removed from urls later
-            url_teshu=url_add_time(url,date_begin,date_end)  # add the date range to the url
-            list_to_MySql=get_MorePages_teshu_shiqijia(url_teshu,page)  # shop url, number of pages to crawl
-            # b=remove_Duplicates_list(list_to_MySql)
-            # for item in b:
-            #     print(item)
-            if len(list_to_MySql) == 0 :
-                print('No data for this shop in the date range; skipped')
-                continue
-            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)  # remove all duplicate rows for this shop's requested pages
-            is_minutes_exceed_30(has_remove_duplicates,con)  # write the device-failure data into the database exception table
-            isExceeding(has_remove_duplicates,con)  # write the crawled data into the exceeding-limit table
-            ea_t_dev(has_remove_duplicates,con)  # write the crawled data into the device-info table
-            fd_t_minbute(has_remove_duplicates,con)  # write the crawled data into the minute-data table
-
-            list_to_MySql.clear()
-    for t in teshu_url:  # remove the special ones from urls
-        urls.remove(t)
-    print(len(urls))
-    return urls
-
-#-------------------------------------------------------------------------
-
-
-def spider_all(con,page,date_begin=month_begin,date_end=now_date):  # crawl every shop in the file (including the special-url shops); database connection, pages to crawl, start date, end date
-    global already_spider_shopnum
-    url_all=[]
-    #urls=url_more()  # urls for every shop in the file, with the maximum page size of 100
-    # handle the non-conforming (special) ones first
-    urls=spilt_url_teshu(con,page,date_begin,date_end)
-
-    for url in urls:  # add the date range to every url
-        url_all.append(url_add_time(url,date_begin,date_end))
-
-    for i in url_all:  # print the final urls
-        print(i)
-
-    for j in url_all:  # write to the database for every url
-        list_to_MySql=get_MorePages(j,page)  # shop url, number of pages to crawl
-        already_spider_shopnum += 1  # crawled-shop count + 1
-        # a=remove_Duplicates_list(list_to_MySql)
-        # print('\n\n')
-        # for item in a:
-        #     print(item)
-        if len(list_to_MySql) == 0 :
-            print('No data for this shop in the date range; skipped')
-            continue
-        has_remove_duplicates = remove_Duplicates_list(list_to_MySql)  # remove all duplicate rows for this shop's requested pages
-        is_minutes_exceed_30(has_remove_duplicates,con)  # write the device-failure data into the database exception table
-        isExceeding(has_remove_duplicates,con)  # write the crawled data into the exceeding-limit table / exception table
-        ea_t_dev(has_remove_duplicates,con)  # write the crawled data into the device-info table
-        fd_t_minbute(has_remove_duplicates,con)  # write the crawled data into the minute-data table
-
-        list_to_MySql.clear()
-
-def back_cookie():  # read the cookie from a file
-    global ck
-    with open("D:\\z\\workplace\\cookie.txt",'r') as fp:
-        ck=fp.read()
-
-
-def write_Sql(list,con):  # write the site data into the database
-    data = pd.DataFrame(list,columns=['provider','shop_name','equipment_number','equipment_name','smoke_push_density','smoke_pop_density','wind_turbine','purifier','level','alarm_required','alarm_triggered','attribution_time','reporting_time','data_time'])
-    print("\n\n")
-    print(data)
-    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    # con = engine.connect()
-
-    # target table; when writing this way the table must already exist in the database
-    data.to_sql(name="ed_data", con=con, if_exists="append",index=False,index_label=False)
-    # con.close()
-    print("Write complete!")
-
-ck=""  # holds the cookie
-shopnum=0  # total number of shops in the file
-already_spider_shopnum=0  # number of shops crawled
-already_spider_datanum=0  # number of rows crawled
-sleeptime=4
-
-Key_period_noon_begin = datetime.strptime('10:00',"%H:%M")  # midday key period
-Key_period_noon_end = datetime.strptime('14:00',"%H:%M")
-
-
-Key_period_night_begin = datetime.strptime('17:00',"%H:%M")  # evening key period
-Key_period_night_end = datetime.strptime('21:00',"%H:%M")
-
-def pass_login():
-    global con_read
-    #"mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8"
-    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-    con = engine.connect()
-
-    back_cookie()  # read the cookie from the file
-
-    # crawl every shop and time it
-    start_time=time.time()
-
-    spider_all(con,55,'2023-06-01','2023-06-30')  # crawl every shop name in the file
-
-    end_time=time.time()
-    # close the database connections
-    con_read.close()
-    con.close()
-    print("Write complete!")
-    print("Crawl interval set to",sleeptime,"seconds")
-    print("Total shops:",shopnum,"crawled:",already_spider_shopnum)
-    print("Total records crawled:",already_spider_datanum)
-    print("Elapsed: {:.2f} s".format(end_time-start_time))
-
-engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
-# dedicated connection for reading the device-info table
-con_read = engine.connect()
-pass_login()
diff --git a/src/__pycache__/Crawling_1.cpython-38.pyc b/src/__pycache__/Crawling_1.cpython-38.pyc
index 3a1d924..38340b2 100644
--- a/src/__pycache__/Crawling_1.cpython-38.pyc
+++ b/src/__pycache__/Crawling_1.cpython-38.pyc
Binary files differ
diff --git a/src/__pycache__/auto_login.cpython-38.pyc b/src/__pycache__/auto_login.cpython-38.pyc
index 75beb5e..4b964db 100644
--- a/src/__pycache__/auto_login.cpython-38.pyc
+++ b/src/__pycache__/auto_login.cpython-38.pyc
Binary files differ
diff --git a/src/__pycache__/write_to_MySql.cpython-38.pyc b/src/__pycache__/write_to_MySql.cpython-38.pyc
index 8728de9..ce117b5 100644
--- a/src/__pycache__/write_to_MySql.cpython-38.pyc
+++ b/src/__pycache__/write_to_MySql.cpython-38.pyc
Binary files differ
diff --git a/src/__pycache__/write_to_minute_table.cpython-38.pyc b/src/__pycache__/write_to_minute_table.cpython-38.pyc
index 03e0342..a19315b 100644
--- a/src/__pycache__/write_to_minute_table.cpython-38.pyc
+++ b/src/__pycache__/write_to_minute_table.cpython-38.pyc
Binary files differ
diff --git a/src/core_modules/__pycache__/remove_duplicates_methods.cpython-38.pyc b/src/core_modules/__pycache__/remove_duplicates_methods.cpython-38.pyc
index b77dd82..e4fb830 100644
--- a/src/core_modules/__pycache__/remove_duplicates_methods.cpython-38.pyc
+++ b/src/core_modules/__pycache__/remove_duplicates_methods.cpython-38.pyc
Binary files differ
diff --git a/src/user/__pycache__/account_photo.cpython-38.pyc b/src/user/__pycache__/account_photo.cpython-38.pyc
index d569f67..cb32133 100644
--- a/src/user/__pycache__/account_photo.cpython-38.pyc
+++ b/src/user/__pycache__/account_photo.cpython-38.pyc
Binary files differ
--
Gitblit v1.9.3
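
Note on the dialect change named in the commit subject: the deleted main.py imports pymysql but builds all of its SQLAlchemy engines with the mysql+mysqlconnector dialect. The new code is not part of this patch, so the following is only a minimal sketch, assuming the change was a switch to the mysql+pymysql dialect with the same credentials and charset.

    # Hypothetical sketch, not taken from this patch: switch the SQLAlchemy
    # dialect from mysql+mysqlconnector to mysql+pymysql (pymysql is already
    # imported by the old main.py); everything else stays the same.
    from sqlalchemy import create_engine

    engine = create_engine("mysql+pymysql://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    con = engine.connect()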