# Summary: multi-page crawl with database loading. Crawls every shop listed in
# the file, scrapes the complete web table, and removes duplicate rows. An empty
# page skips ahead to the next shop; a shop with no data at all is skipped too.
# Crawls a given number of pages per shop (page size defaults to 100 records);
# e.g. for 12 pages, all 12 are fetched first and the result is written to the
# database in one batch.
# The whole page table is scraped and written into 4 tables (the raw data goes
# into two tables, plus the exceeding-standard table and the exception table).
# The web page has 14 fields; 15 columns are written to the database (id + 14).
import requests
from bs4 import BeautifulSoup as bs
from aip import AipOcr          # Baidu OCR
import re                       # regular expressions
from pymysql import *           # MySQL driver
import pandas as pd
from sqlalchemy import create_engine
import urllib.parse             # URL percent-encoding
import time
import uuid
from datetime import datetime, timedelta

import sys
sys.path.append('D:\\z\\workplace\\VsCode\\show')
import core_modules.remove_duplicates_methods as rdm

now_date = time.strftime("%Y-%m-%d", time.localtime())   # current date; default end of the URL date range
now_date1 = time.strftime("%Y-%m", time.localtime())
month_begin = now_date1 + '-01'                           # first day of the current month; default start date

list_temp = []   # temporary per-page list (global)


def remove_Duplicates_list(list):   # de-duplicate a list, preserving order
    global already_spider_datanum
    list_store = []
    for item in list:
        if item not in list_store:
            list_store.append(item)
        else:
            print("Duplicate found")
            already_spider_datanum = already_spider_datanum - 1   # drop it from the running total
    #print(list_store)
    return list_store
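
# A worked example of the order-preserving de-duplication above (hypothetical
# rows): the second copy of a row is dropped and the global counter shrinks by
# one per duplicate.
#   remove_Duplicates_list([['dev1', '0.8'], ['dev2', '0.5'], ['dev1', '0.8']])
#   -> [['dev1', '0.8'], ['dev2', '0.5']]   # already_spider_datanum -= 1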

def merge(list):   # merge the last six elements of the list pairwise into three "date time" strings
    date_1 = str(list.pop(-1))   # pop returns the removed tail element, so it can still be used
    date_2 = str(list.pop(-1))
    date1 = date_2 + ' ' + date_1   # merge into "YYYY-MM-DD HH:MM"

    date_3 = str(list.pop(-1))
    date_4 = str(list.pop(-1))
    date2 = date_4 + ' ' + date_3

    date_5 = str(list.pop(-1))
    date_6 = str(list.pop(-1))
    date3 = date_6 + ' ' + date_5
    list.append(date3)   # put the merged values back at the end of the list
    list.append(date2)
    list.append(date1)


    return list
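
# A worked example (hypothetical values): merge() folds the last six elements
# pairwise into three "date time" strings, keeping their relative order.
#   merge(['dev1', '2023-06-01', '12:00', '2023-06-01', '12:10', '2023-06-01', '12:20'])
#   -> ['dev1', '2023-06-01 12:00', '2023-06-01 12:10', '2023-06-01 12:20']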

def list_url(url, page_num):   # page_num is the number of pages to crawl; appends the page parameter to the URL
    urls = [url + '&page={}'.format(i) for i in range(1, page_num + 1)]
    return urls   # all page URLs for this base URL, as a list
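
# For example, list_url(base, 3) on a base URL ending in '...&pagesize=100'
# yields ['...&pagesize=100&page=1', '...&pagesize=100&page=2',
# '...&pagesize=100&page=3'].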


def get_OnePage(url, count):   # scrape one page into list_temp; url is the page address
    global ck
    global list_temp           # use the global temporary list

    list_temp.clear()          # empty the temporary list
    headers = {
        # note: the cookie has to be captured yourself
        "Cookie": ck,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    }
    r = requests.get(url=url, headers=headers, verify=False).text
    soup = bs(r, 'html.parser')

    list = []                        # collects the parsed rows
    tags = soup.find_all("tr")       # all table rows
    for tag in tags:                 # each tag is one row
        count = count + 1
        element = tag.text           # all text inside the <tr> tag
        element = element.strip()    # strip leading/trailing whitespace
        list1 = element.split()      # split on whitespace into a list

        del (list1[-2:])             # the last two cells are not needed
        list1.insert(3, '')          # placeholder so the row has the full field count
        list.append(list1)           # collect every row

    #print(list)


    list_data = []                   # one page of rows with merged timestamps
    for i in list:
        list_data.append(merge(i))   # trailing date fields merged into "date time" strings; rows are now complete

    del list_data[0]                 # drop the header row
    count = count - 1                # header removed, so one row less
    #list_removeD = remove_Duplicates_list(list_data)   # list_data holds one page of rows

    #print(list_data)
    list_temp = list_data[:]
    #list_temp = remove_Duplicates_list(list_data)[:]   # copy the de-duplicated page into list_temp
    return count
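
# Row-parsing sketch (column layout assumed from the code above): each <tr>'s
# text is whitespace-split, the last two cells are dropped, an empty
# placeholder is inserted at index 3, and merge() rebuilds the three trailing
# "date time" pairs, so every row comes out in the full 15-field layout the
# downstream writers index into (e.g. item[2] device code, item[5] outlet
# concentration, item[11] data time).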


def get_MorePages(url, page_num):   # crawl several pages for one shop; page_num is the page count
    global sleeptime
    global already_spider_datanum
    urls = list_url(url, page_num)  # URLs of the pages to visit
    count_all = 0                   # total number of rows
    list_all = []                   # all crawled rows
    page = 1
    for i in urls:
        count = 0
        count_all = count_all + get_OnePage(i, count)
        if len(list_temp) == 0:     # an empty page means all later pages are empty too
            print('Remaining pages are empty; moving on to the next shop')
            break                   # leave the loop
        list_all.extend(list_temp)  # append this page's rows to list_all
        print("Crawled page", page)
        page = page + 1
        print("\n")
        time.sleep(sleeptime)       # pause between requests

    for j in list_all:
        print(j)                    # print every row
    print("Total rows:", count_all)
    already_spider_datanum += count_all   # running total of crawled rows

    return list_all
    #return remove_Duplicates_list(list_all)   # optionally filter duplicates once more


def url_more():   # build one URL per shop name in the file; returns a list of URLs. pagesize=100 is the page's maximum display size
    global shopnum
    shopnames = []   # shop names read from the file
    with open("D:\\z\\workplace\\shopname.txt", encoding='utf-8') as file:
        for line in file:
            line = line.strip()        # or some other preprocessing
            shopnames.append(line)     # storing everything in memory!
    #print(type(shopnames[0]))
    # encoding
    shopnum = len(shopnames)           # total number of shops in the file
    shopname_encoding = []             # percent-encoded shop names
    i = 0
    for name in shopnames:
        shopname_encoding.append(urllib.parse.quote(urllib.parse.quote(shopnames[i])))   # double URL-encode the shop name
        i = i + 1
    # splice the parts into usable URLs
    urls = []
    for shop in shopname_encoding:
        url = 'http://xhhb.senzly.cn/sys/yyRealTimeValue_list.jsp?key1=&shop=' + shop + '&pagesize=100'
        urls.append(url)
    # for i in urls:
    #     print(i)
    return urls   # one URL per shop name in the file
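
# Double-encoding example, checkable against special_url further below: the
# second quote() pass also escapes the '%' signs produced by the first one.
#   urllib.parse.quote(urllib.parse.quote('食其家'))
#   -> '%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6'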

# splice the begin and end dates onto a URL
def url_add_time(url, date_begin=month_begin, date_end=now_date):   # dates as 'YYYY-MM-DD', e.g. 2023-05-03
    url_date = url + '&key5=' + date_begin + '&key6=' + date_end
    print(url_date)
    return url_date
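
# Example: url_add_time(url, '2023-06-01', '2023-06-30') appends
# '&key5=2023-06-01&key6=2023-06-30' to the shop URL.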

# ------------------------------------------------------------------ write exceeding-standard fume data into the exception table
# True if the two times are exactly 10 minutes apart (in either direction)
def is_time_difference_equals_10_mins(datestr1, datestr2):
    date1 = datetime.strptime(datestr1, "%Y-%m-%d %H:%M")
    date2 = datetime.strptime(datestr2, "%Y-%m-%d %H:%M")
    time_diff = date2 - date1

    return time_diff == timedelta(minutes=10) or time_diff == timedelta(minutes=-10)   # timedelta() is the interval between two date/time/datetime objects
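
# Examples: exactly ten minutes apart, in either direction, returns True.
#   is_time_difference_equals_10_mins('2023-06-01 12:00', '2023-06-01 12:10')   # True
#   is_time_difference_equals_10_mins('2023-06-01 12:10', '2023-06-01 12:00')   # True
#   is_time_difference_equals_10_mins('2023-06-01 12:00', '2023-06-01 12:25')   # False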


# A reading every ten minutes is normal; find the break points where the gap exceeds 10 minutes
def find_break_point(list):   # list holds the exceeding-standard rows
    i = 0
    j = 1
    break_point = []          # collected break points
    for item in list[1:]:
        if (is_time_difference_equals_10_mins(list[i][2], item[2]) == False):
            break_point.append(j)
        i = i + 1
        j = j + 1
    print('Break points:')
    print(break_point)

    # return the break points
    return break_point
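
# A worked example (hypothetical rows; index 2 holds the merged data time,
# descending): the jump from 12:40 to 12:00 is not ten minutes, so the row at
# index 3 is a break point.
#   rows = [['d1', 0, '2023-06-01 13:00'], ['d1', 0, '2023-06-01 12:50'],
#           ['d1', 0, '2023-06-01 12:40'], ['d1', 0, '2023-06-01 12:00']]
#   find_break_point(rows)  ->  [3]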


# split the list into sub-lists at the break points; returned as result
def point_write(list, b_point):   # b_point holds the break points in ascending order
    result = []
    last_index = 0
    for index in b_point:
        result.append(list[last_index:index])   # slice between consecutive break points
        last_index = index
    result.append(list[last_index:])
    return result
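
# A worked example: point_write(['a', 'b', 'c', 'd', 'e'], [2, 4])
# -> [['a', 'b'], ['c', 'd'], ['e']]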


# write device-failure/exception rows into the abnormal_data exception table
def abnormal_write_to_SQL(list, con):
    data = pd.DataFrame(list, columns=['dev_id', 'exception', 'exception_type', 'region', 'begin_time', 'end_time'])
    print("\n\n")
    print(data)
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    # con = engine.connect()

    # target table; it must already exist in the database
    data.to_sql(name="abnormal_data", con=con, if_exists="append", index=False, index_label=False)
    # con.close()



def exception(list, con):   # list holds the exceeding-standard rows
    break_point = find_break_point(list)           # find the break points
    split_list = point_write(list, break_point)    # split the original list at the break points; split_list is three levels deep, e.g. [[[1,2],[4,'g']],[[8,'2'],['4','g']],[[1,2],[4,'g']]]
    # print('Sub-lists of the exceeding-standard periods:')
    # for i in split_list:
    #     print(i)
    print('\n')
    abnormal = []   # reassembled exception-table rows

    for item in split_list:   # pull the needed time information out of each sub-list and add the new fields
        temp = []
        temp.append(item[0][0])               # device code
        temp.append('数据异常')                # exception name ("data exception")
        temp.append('0')                      # type 0 = fume concentration exceeds the standard
        temp.append('徐汇区')                  # region (Xuhui District)
        temp.append(item[len(item) - 1][2])   # data time of the last (oldest) record = begin time
        temp.append(item[0][2])               # data time of the first record = end time
        abnormal.append(temp)

    print(abnormal)

    print('Exceeding-standard exception periods:')
    for j in abnormal:
        print(j)
    abnormal_write_to_SQL(abnormal, con)   # write into the exception table
    print("Exceeding-standard fume exceptions written to the exception table!")

# ------------------------------------------------------------------ write device-failure data into the exception table
# True if datestr2 is more than 30 minutes later than datestr1
def is_time_difference_equals_30_mins(datestr1, datestr2):
    date1 = datetime.strptime(datestr1, "%Y-%m-%d %H:%M")
    date2 = datetime.strptime(datestr2, "%Y-%m-%d %H:%M")
    time_diff = date2 - date1
    return time_diff > timedelta(minutes=30)
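
# Note the asymmetry with the 10-minute check: here datestr2 must be the later
# time and the gap strictly greater than 30 minutes.
#   is_time_difference_equals_30_mins('2023-06-01 12:00', '2023-06-01 12:41')   # True
#   is_time_difference_equals_30_mins('2023-06-01 12:00', '2023-06-01 12:30')   # False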

# find device-failure periods and write them into the exception table
def is_minutes_exceed_30(list, con):   # list holds all records of one shop's crawled pages; their times are in descending order (newest first)
    device_failure = []                # collected device-failure rows
    startTime = list[0][11]
    print('Start time:', startTime)
    for item in list[1:]:
        if is_time_difference_equals_30_mins(item[11], startTime):   # must be strictly greater than 30 minutes, not equal
            temp = []
            temp.append(item[2])       # device code
            temp.append('设备故障')     # exception name ("device failure")
            temp.append('1')           # type 1 = device failure
            temp.append('徐汇区')       # region
            temp.append(item[11])      # failure begin time
            startTimeSub = datetime.strptime(startTime, "%Y-%m-%d %H:%M") - timedelta(minutes=10)   # a datetime.datetime; convert back to a string
            print('After subtraction:', str(startTimeSub))
            print('Type after subtraction:', type(str(startTimeSub)))
            temp.append(str(startTimeSub)[:16])   # failure end time
            device_failure.append(temp)
        startTime = item[11]
    print('Device-failure rows:')
    for i in device_failure:
        print(i)
    not_Key_period_exceed_30_minutes(device_failure, con)   # classify and write power-supply anomalies into the exception table
    #abnormal_write_to_SQL(device_failure, con)              # write the device-failure rows into the exception table
    print('Power-supply anomaly / offline rows written to the exception table!')
# ----------------------------------------------------------------- write power-supply anomalies into the exception table
# True when both the begin and end times fall outside the key periods
def is_time_not_between_key_period(begin_time, end_time):   # date strings such as '2023-06-21 14:30'
    global Key_period_noon_begin, Key_period_noon_end, Key_period_night_begin, Key_period_night_end
    # # noon key period
    # Key_period_noon_begin = datetime.strptime('10:00', "%H:%M")
    # Key_period_noon_end = datetime.strptime('14:00', "%H:%M")

    # # evening key period
    # Key_period_night_begin = datetime.strptime('17:00', "%H:%M")
    # Key_period_night_end = datetime.strptime('21:00', "%H:%M")

    begin1 = datetime.strptime(begin_time[11:], "%H:%M")
    end1 = datetime.strptime(end_time[11:], "%H:%M")

    # when neither time is within a key period, the failure is recorded as: suspected power-supply anomaly
    if (((begin1 > Key_period_noon_begin and begin1 < Key_period_noon_end) or (begin1 > Key_period_night_begin and begin1 < Key_period_night_end)) or ((end1 > Key_period_noon_begin and end1 < Key_period_noon_end) or (end1 > Key_period_night_begin and end1 < Key_period_night_end))) == False:
        print('Begin and end times are outside the key periods')
        return True
    print('Within a key period')
    return False
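
# Example: 14:30 and 15:00 both fall outside the key periods (10:00-14:00 and
# 17:00-21:00), so a failure spanning them counts as a suspected power issue.
#   is_time_not_between_key_period('2023-06-21 14:30', '2023-06-21 15:00')   # True
#   is_time_not_between_key_period('2023-06-21 12:30', '2023-06-21 15:00')   # False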

# True when both the begin and end times fall within a key period
def is_time_between_key_period(begin_time, end_time):   # date strings such as '2023-06-21 14:30'
    global Key_period_noon_begin, Key_period_noon_end, Key_period_night_begin, Key_period_night_end
    # # noon key period
    # Key_period_noon_begin = datetime.strptime('10:00', "%H:%M")
    # Key_period_noon_end = datetime.strptime('14:00', "%H:%M")

    # # evening key period
    # Key_period_night_begin = datetime.strptime('17:00', "%H:%M")
    # Key_period_night_end = datetime.strptime('21:00', "%H:%M")

    begin1 = datetime.strptime(begin_time[11:], "%H:%M")
    end1 = datetime.strptime(end_time[11:], "%H:%M")

    # when both times are within a key period, the failure is recorded as: offline
    if ((begin1 > Key_period_noon_begin and begin1 < Key_period_noon_end) and (end1 > Key_period_noon_begin and end1 < Key_period_noon_end)) or ((begin1 > Key_period_night_begin and begin1 < Key_period_night_end) and (end1 > Key_period_night_begin and end1 < Key_period_night_end)):
        print('Begin and end times are within a key period')
        return True
    print('Outside the key periods')
    return False



def not_Key_period_exceed_30_minutes(list, con):   # list holds the device-failure period rows
    power_supply_abnormal = []                     # suspected power-supply anomaly / offline rows
    for item in list:
        if is_time_not_between_key_period(item[4], item[5]):
            temp = []
            temp.append(item[0])
            temp.append('设备故障')    # exception name ("device failure")
            temp.append('1')          # type 1 = suspected power-supply anomaly
            temp.append('徐汇区')      # region
            temp.append(item[4])
            temp.append(item[5])
            power_supply_abnormal.append(temp)
        elif is_time_between_key_period(item[4], item[5]):
            temp = []
            temp.append(item[0])
            temp.append('设备故障')
            temp.append('2')          # type 2 = offline
            temp.append('徐汇区')
            temp.append(item[4])
            temp.append(item[5])
            power_supply_abnormal.append(temp)
    print('Power-supply anomaly rows:')
    for i in power_supply_abnormal:
        print(i)

    # write the power-supply anomaly rows into the database exception table
    abnormal_write_to_SQL(power_supply_abnormal, con)
    print('Power-supply anomaly rows written to the exception table!')



# ------------------------------------------------------------------ write into the exceeding-standard table

# return a reassembled row
def refind_ex(list):   # list is one record from the web page
    temp = []
    temp.append(list[2])    # device code
    temp.append(list[12])   # reporting time
    temp.append(list[11])   # data (attribution) time
    temp.append(list[6])    # fan current
    temp.append(list[7])    # purifier current
    temp.append(list[4])    # inlet fume concentration
    temp.append(list[5])    # outlet fume concentration

    print(temp)
    return temp


# write the rows into the exceeding_st_data table
def ex_write_to_SQL(list, con):
    data = pd.DataFrame(list, columns=['MV_Stat_Code', 'MV_Create_Time', 'MV_Data_Time', 'MV_Fan_Electricity', 'MV_Purifier_Electricity', 'MV_Fume_Concentration', 'MV_Fume_Concentration2'])
    print("\n\n")
    print(data)
    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    #con = engine.connect()

    # target table; it must already exist in the database
    data.to_sql(name="exceeding_st_data", con=con, if_exists="append", index=False, index_label=False)
    #con.close()
    print("Exceeding-standard table written!")


# write the exceeding-standard rows of one shop's crawled pages into the exceeding-standard table
def isExceeding(list, con):   # list holds all records of one shop's crawled pages; each element is itself a list
    exceedingData = []        # collected exceeding-standard rows
    for item in list:         # find and record the exceeding rows
        if float(item[5]) > 1:   # outlet fume concentration above 1 exceeds the standard
            print("Row exceeds the standard")
            # keep this record: extract the needed values and add the extra fields
            exceedingData.append(refind_ex(item))


    for i in exceedingData:
        print(i)

    if (len(exceedingData) != 0):   # only run when there are exceeding rows
        # classify the exceeding periods into the abnormal_data exception table
        exception(exceedingData, con)

        # write the exceeding rows straight into the exceeding-standard table
        ex_write_to_SQL(exceedingData, con)
    else:
        print('No exceeding-standard data for this shop')


# ------------------------------------------------------------------ write data into the device info table
def generate_short_uuid():
    # 62-character alphabet: a-z, 0-9, A-Z
    arrayOf = list("abcdefghijklmnopqrstuvwxyz"
                   "0123456789"
                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    chars = []
    ui = str(uuid.uuid4()).replace('-', '')
    for i in range(0, 16):
        a1 = ui[i * 2:i * 2 + 2]           # two hex characters
        x = int(a1, 16)                    # -> one byte
        chars.append(arrayOf[x % 0x3E])    # map into the 62-char alphabet
    return ''.join(chars)
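
# How the short id is built (restating the logic above): uuid4() yields 32 hex
# characters; each of the 16 two-character pairs is parsed as a byte (0-255)
# and reduced modulo 0x3E (62) to index the 62-character alphabet, giving a
# 16-character id such as 'k3Fq9xAz07LmPw2b' (illustrative output only).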


# return a reassembled row
def refind_ea(list):   # one record, i.e. one list
    temp = []
    temp.append(generate_short_uuid())   # DI_GUID
    temp.append(list[2])                 # DI_Code: device code
    temp.append(list[1])                 # DI_Name: shop name
    temp.append(list[0])                 # DI_Supplier
    temp.append(1)                       # DI_Online
    print(temp)
    return temp

# write the rows into the ea_t_device_info device info table
def ea_write_to_SQL(list, con):
    data = pd.DataFrame(list, columns=['DI_GUID', 'DI_Code', 'DI_Name', 'DI_Supplier', 'DI_Online'])
    print("\n\n")
    print('Writing to table; DataFrame:', data)

    # target table; it must already exist in the database
    data.to_sql(name="ea_t_device_info", con=con, if_exists="append", index=False, index_label=False)
    print("Device info table written!")


def dev_info_data_if_exisitd(list, con):   # list holds the transformed rows of one shop's crawled pages
    global con_read
    # create a second database connection
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    # con_read = engine.connect()

    df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier FROM ea_t_device_info', con=con_read)   # read device code, shop name, and supplier from the device info table; returns a DataFrame
    # con_read.close()   # close the connection

    res = df.values.tolist()   # DataFrame rows as a list; res holds the device info table's current data
    print('******** device info ******')
    for i in res:
        print(i)
    print('Device info table row count:', len(res))

    list1 = rdm.remove_duplicates_dev_info(list)   # rows with equal device code, shop name, and supplier are duplicates and removed; list1 is the de-duplicated list
    if len(res) > 0:   # the device table already has data
        # compare
        temp = list1[:]   # iterate over the copy while deleting from list1, to avoid mutating a list during iteration
        print('After de-duplication:')
        print(list1)
        for item in temp:
            if item[1:4] in (x[:] for x in res):   # a pending row equal to a device-table row is removed from the pending list
                list1 = rdm.remove_given_data_dev_info(list1, item[1:4])   # remove this item from list1

        print('Device table has data; de-duplicated list:', list1)
        if (len(list1) != 0):   # write only if something is left after removal
            ea_write_to_SQL(list1, con)   # write the rows into the ea_t_device_info table
    else:   # the device table is empty
        # a = rdm.remove_duplicates_dev_info(list)   # rows with equal device code, shop name, and supplier are duplicates
        print('Device table is empty; pending device info after processing:', list1)
        # write the de-duplicated rows into the device info table
        ea_write_to_SQL(list1, con)



# transform the raw rows into new lists, then write them into the device info table
def ea_t_dev(list, con):   # rows of one shop's crawled pages; each element is itself a list, e.g. [[1,2,3,'a'],[5,2,3,'a'],[6,2,3,'a']]; con is the database connection
    staging = []   # the transformed rows
    for item in list:
        # extract the needed values and add the extra fields
        staging.append(refind_ea(item))   # transform
    print('Device rows after transformation:')
    for i in staging:
        print(i)

    # check what the device table already holds; rows whose device info already exists are not written
    dev_info_data_if_exisitd(staging, con)


# -------------------------------- write into the minute data table

# return a reassembled row
def refind_fd(list):   # one record, i.e. one list
    temp = []
    temp.append(list[2])    # device code
    temp.append(list[12])   # reporting time
    temp.append(list[11])   # data (attribution) time
    temp.append(list[6])    # fan current
    temp.append(list[7])    # purifier current
    temp.append(list[4])    # inlet fume concentration
    temp.append(list[5])    # outlet fume concentration

    print(temp)
    return temp


# write the rows into the minute data table
def fd_write_to_SQL(list, con):
    data = pd.DataFrame(list, columns=['MV_Stat_Code', 'MV_Create_Time', 'MV_Data_Time', 'MV_Fan_Electricity', 'MV_Purifier_Electricity', 'MV_Fume_Concentration', 'MV_Fume_Concentration2'])
    print("Writing to the minute data table; DataFrame:")
    print(data)

    # target table; it must already exist in the database
    data.to_sql(name="fd_t_minutevalue", con=con, if_exists="append", index=False, index_label=False)

    print("Minute data table written!")

# transform, then write into the fd_t_minutevalue table
def fd_t_minbute(list, con):   # one page of records; con is the database connection
    staging = []   # the transformed rows
    for item in list:
        # extract the needed values and add the extra fields
        staging.append(refind_fd(item))
    print('Minute rows after transformation:')
    for i in staging:
        print(i)
    fd_write_to_SQL(staging, con)   # write the rows into the fd_t_minutevalue table


# ------------------------------------------------------------------ special case: 食其家 (Sukiya)
def get_OnePage_teshu_shiqijia(url, count):
    global ck
    global list_temp   # use the global temporary list


    list_temp.clear()  # empty the temporary list
    headers = {
        # note: the cookie has to be captured yourself
        "Cookie": ck,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    }
    r = requests.get(url=url, headers=headers, verify=False).text
    soup = bs(r, 'html.parser')

    list = []   # collects the parsed rows

    tags = soup.find_all("tr")       # all table rows
    for tag in tags:                 # each tag is one row
        count = count + 1
        element = tag.text           # all text inside the <tr> tag
        element = element.strip()    # strip leading/trailing whitespace
        list1 = element.split()      # split on whitespace into a list

        del (list1[-2:])             # the last two cells are not needed
        print('Dropped the special trailing two cells')
        print(list1)

        str_temp1 = list1[4] + list1[5]   # merge cells 5 and 6 into one
        print(str_temp1)
        del list1[5]
        list1[4] = str_temp1
        print("Cells merged")
        print(list1)

        str_temp2 = list1[1] + list1[2]   # merge cells 2 and 3 as well
        del list1[2]
        list1[1] = str_temp2

        list.append(list1)
        print("Final row")
        print(list1)
        #list1.clear()

    #print(list)
    list_data = []
    for i in list:   # trailing date fields merged into "date time" strings
        list_data.append(merge(i))
    del list_data[0]    # drop the header row
    count = count - 1   # header removed, so one row less
    #print(list_data)
    #list_temp = remove_Duplicates_list(list_data)[:]   # copy the de-duplicated page into list_temp
    list_temp = list_data[:]
    return count


def get_MorePages_teshu_shiqijia(url, page_num):
    global sleeptime
    global already_spider_datanum
    urls = list_url(url, page_num)   # URLs of the pages to visit
    count_all = 0                    # total number of rows
    list_all = []                    # all crawled rows
    page = 1
    for i in urls:
        count = 0
        count_all = count_all + get_OnePage_teshu_shiqijia(i, count)
        if len(list_temp) == 0:      # an empty page means all later pages are empty too
            print('Remaining pages are empty; moving on to the next shop')
            break
        list_all.extend(list_temp)   # append this page's rows to list_all
        print("Crawled page", page)
        page = page + 1
        print("\n")
        time.sleep(sleeptime)        # pause between requests

    for j in list_all:
        print(j)                     # print every row
    print("Total rows:", count_all)
    already_spider_datanum += count_all   # running total of crawled rows
    return list_all



# ------------------------------------------------------------------ special URLs
def get_OnePage_teshu(url, count):   # scrape one page into list_temp; url is the page address
    global ck
    global list_temp   # use the global temporary list

    list_temp.clear()  # empty the temporary list
    headers = {
        # note: the cookie has to be captured yourself
        "Cookie": ck,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    }
    r = requests.get(url=url, headers=headers, verify=False).text
    soup = bs(r, 'html.parser')

    list = []   # collects the parsed rows

    tags = soup.find_all("tr")       # all table rows
    for tag in tags:                 # each tag is one row
        count = count + 1
        element = tag.text           # all text inside the <tr> tag
        element = element.strip()    # strip leading/trailing whitespace
        list1 = element.split()      # split on whitespace into a list

        del (list1[-2:])             # the last two cells are not needed
        print('Dropped the special trailing two cells')
        print(list1)
        list.append(list1)
        #list1.clear()

    #print(list)
    list_data = []
    for i in list:
        list_data.append(merge(i))   # trailing date fields merged into "date time" strings
    del list_data[0]    # drop the header row
    count = count - 1   # header removed, so one row less
    #print(list_data)
    #list_temp = remove_Duplicates_list(list_data)[:]   # copy the de-duplicated page into list_temp
    list_temp = list_data[:]
    return count


def get_MorePages_teshu(url, page_num):   # crawl several pages for one shop; page_num is the page count
    global sleeptime
    global already_spider_datanum
    urls = list_url(url, page_num)   # URLs of the pages to visit, as a list
    count_all = 0                    # total number of rows
    list_all = []                    # all crawled rows
    page = 1
    for i in urls:
        count = 0
        count_all = count_all + get_OnePage_teshu(i, count)
        if len(list_temp) == 0:      # an empty page means all later pages are empty too
            print('Remaining pages are empty; moving on to the next shop')
            break
        list_all.extend(list_temp)   # append this page's rows to list_all
        print("Crawled page", page)
        page = page + 1
        print("\n")
        time.sleep(sleeptime)        # pause between requests

    for j in list_all:
        print(j)                     # print every row
    print("Total rows:", count_all)
    already_spider_datanum += count_all   # running total of crawled rows
    return list_all


def spilt_url_teshu(con, page, date_begin=month_begin, date_end=now_date):   # handle the special URLs first, then filter them out
    global already_spider_shopnum
    urls = url_more()   # URLs of all shops in the file, with pagesize=100; urls is a list
    #print(urls)
    teshu_url = []
    # 'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' is 食其家 (Sukiya)
    special_url = ['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2', 'shop=%25E7%25BC%2598%25E5%25AE%25B6', 'shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2', 'shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1', 'shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B']

    for url in urls:   # walk every shop URL
        begin = url.find('&') + 1
        end = url.rfind('&')
        #print(begin, end)
        # special URLs get special handling
        if url[begin:end] in special_url:
            print('Found a special one!')
            already_spider_shopnum += 1   # one more shop crawled
            teshu_url.append(url)         # removed from urls afterwards
            url_teshu = url_add_time(url, date_begin, date_end)     # add the date range to the URL
            list_to_MySql = get_MorePages_teshu(url_teshu, page)    # shop URL, number of pages to crawl
            # a = remove_Duplicates_list(list_to_MySql)
            # print('\n')
            # for item in a:
            #     print(item)
            if len(list_to_MySql) == 0:
                print('No data for this shop in the period; skipped')
                continue
            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   # drop duplicate rows across the crawled pages
            is_minutes_exceed_30(has_remove_duplicates, con)   # device-failure rows into the exception table
            isExceeding(has_remove_duplicates, con)            # exceeding rows into the exceeding-standard table
            ea_t_dev(has_remove_duplicates, con)               # device info into the device info table
            fd_t_minbute(has_remove_duplicates, con)           # all rows into the minute data table

            list_to_MySql.clear()
        if url[begin:end] == 'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6':   # 食其家 (Sukiya)
            print('Found a special one!')
            already_spider_shopnum += 1   # one more shop crawled
            teshu_url.append(url)         # removed from urls afterwards
            url_teshu = url_add_time(url, date_begin, date_end)             # add the date range to the URL
            list_to_MySql = get_MorePages_teshu_shiqijia(url_teshu, page)   # shop URL, number of pages to crawl
            # b = remove_Duplicates_list(list_to_MySql)
            # for item in b:
            #     print(item)
            if len(list_to_MySql) == 0:
                print('No data for this shop in the period; skipped')
                continue
            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   # drop duplicate rows across the crawled pages
            is_minutes_exceed_30(has_remove_duplicates, con)   # device-failure rows into the exception table
            isExceeding(has_remove_duplicates, con)            # exceeding rows into the exceeding-standard table
            ea_t_dev(has_remove_duplicates, con)               # device info into the device info table
            fd_t_minbute(has_remove_duplicates, con)           # all rows into the minute data table

            list_to_MySql.clear()
    for t in teshu_url:   # drop the special URLs from urls
        urls.remove(t)
    print(len(urls))
    return urls

# -------------------------------------------------------------------


def spider_all(con, page, date_begin=month_begin, date_end=now_date):   # crawl every shop in the file (including the special-URL shops); args: db connection, pages to crawl, begin date, end date
    global already_spider_shopnum
    url_all = []
    #urls = url_more()   # URLs of all shops in the file, with pagesize=100
    # handle the special URLs first
    urls = spilt_url_teshu(con, page, date_begin, date_end)

    for url in urls:   # add the date range to every URL
        url_all.append(url_add_time(url, date_begin, date_end))

    for i in url_all:   # print the final URLs
        print(i)

    for j in url_all:   # crawl every URL and write to the database
        list_to_MySql = get_MorePages(j, page)   # shop URL, number of pages to crawl
        already_spider_shopnum += 1              # one more shop crawled
        # a = remove_Duplicates_list(list_to_MySql)
        # print('\n\n')
        # for item in a:
        #     print(item)
        if len(list_to_MySql) == 0:
            print('No data for this shop in the period; skipped')
            continue
        has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   # drop duplicate rows across the crawled pages
        is_minutes_exceed_30(has_remove_duplicates, con)   # device-failure rows into the exception table
        isExceeding(has_remove_duplicates, con)            # exceeding rows into the exceeding-standard and exception tables
        ea_t_dev(has_remove_duplicates, con)               # device info into the device info table
        fd_t_minbute(has_remove_duplicates, con)           # all rows into the minute data table

        list_to_MySql.clear()

def back_cookie():   # read the cookie from a file
    global ck
    with open("D:\\z\\workplace\\cookie.txt", 'r') as fp:
        ck = fp.read()


def write_Sql(list, con):   # write the raw site data into the database
    data = pd.DataFrame(list, columns=['provider', 'shop_name', 'equipment_number', 'equipment_name', 'smoke_push_density', 'smoke_pop_density', 'wind_turbine', 'purifier', 'level', 'alarm_required', 'alarm_triggered', 'attribution_time', 'reporting_time', 'data_time'])
    print("\n\n")
    print(data)
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    # con = engine.connect()

    # target table; it must already exist in the database
    data.to_sql(name="ed_data", con=con, if_exists="append", index=False, index_label=False)
    # con.close()
    print("Write complete!")

ck = ""                      # cookie
shopnum = 0                  # total number of shops in the file
already_spider_shopnum = 0   # shops crawled so far
already_spider_datanum = 0   # rows crawled so far
sleeptime = 4                # seconds between requests

Key_period_noon_begin = datetime.strptime('10:00', "%H:%M")   # noon key period
Key_period_noon_end = datetime.strptime('14:00', "%H:%M")


Key_period_night_begin = datetime.strptime('17:00', "%H:%M")  # evening key period
Key_period_night_end = datetime.strptime('21:00', "%H:%M")

def pass_login():
    global con_read
    #"mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8"
    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
    con = engine.connect()

    back_cookie()   # read the cookie from the file

    # crawl every shop and time the run
    start_time = time.time()

    spider_all(con, 55, '2023-06-01', '2023-06-30')   # crawl every shop named in the file

    end_time = time.time()
    # close the database connections
    con_read.close()
    con.close()
    print("Write complete!")
    print("Crawl interval set to", sleeptime, "seconds")
    print("Total shops:", shopnum, "; crawled:", already_spider_shopnum)
    print("Total records crawled:", already_spider_datanum)
    print("Elapsed: {:.2f}s".format(end_time - start_time))

engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/qianduan_sql?charset=utf8")
# dedicated connection for reading the device info table
con_read = engine.connect()
pass_login()