# imports required by the fragments below
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine

count_all = count_all + get_OnePage(i, count)
if len(list_temp) == 0:  # an empty page means every later page is empty too, so stop
    print('Remaining pages are empty, moving on to the next shop')
    break  # exit the loop
list_all.extend(list_temp)  # append this page's data to list_all
print("Crawled page", page)
page = page + 1
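
# A minimal sketch of the paging loop the fragment above appears to belong to;
# the name get_MorePages and the meaning of get_OnePage's arguments are
# assumptions based on the calls that appear elsewhere in this script.
def get_MorePages(url, pages, count):
    global count_all, page
    page = 1
    for i in range(1, pages + 1):
        count_all = count_all + get_OnePage(i, count)  # crawl one page; returns the number of rows read
        if len(list_temp) == 0:
            print('Remaining pages are empty, moving on to the next shop')
            break
        list_all.extend(list_temp)
        print("Crawled page", page)
        page = page + 1
    return list_all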

data = pd.DataFrame(list, columns=['dev_id', 'exception', 'exception_type', 'region', 'begin_time', 'end_time'])
print("\n\n")
print(data)
# engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
# engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
# con = engine.connect()

# test3 is the table written to; done this way, the table has to be created in the database beforehand

data = pd.DataFrame(list, columns=['MV_Stat_Code', 'MV_Create_Time', 'MV_Data_Time', 'MV_Fan_Electricity', 'MV_Purifier_Electricity', 'MV_Fume_Concentration', 'MV_Fume_Concentration2'])
print("\n\n")
print(data)
# engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
# engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
# con = engine.connect()

# test3 is the table written to; done this way, the table has to be created in the database beforehand
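# A minimal sketch of the write itself, assuming the pre-created table is the
# test3 mentioned above; with if_exists='append', to_sql only inserts rows, so
# the hand-built schema (column types, keys) is left untouched.
# data.to_sql('test3', con=con, if_exists='append', index=False)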

def dev_info_data_if_exisitd(list, con):  # list holds the converted data crawled from one shop over the requested pages
    global con_read
    # create a second database connection
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
    # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
    # con_read = engine.connect()

    df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier FROM ea_t_device_info', con=con_read)  # read device code, shop name and supplier from the device info table; returns a DataFrame
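
    # A minimal sketch of how this read might feed the existence check the
    # function's name suggests; treating item[0] as the device code is an
    # assumption about the crawled row layout, not something the fragment confirms.
    known_codes = set(df['DI_Code'])
    new_rows = [item for item in list if item[0] not in known_codes]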

global ck
global list_temp  # use the global variables
list_temp.clear()  # empty the temporary list
# session.headers = {
#     # note: the Cookie value has to be captured by hand
#     # "Cookie": ck,
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
# }

r = session.get(url, verify=False).text
soup = bs(r, 'html.parser')
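
# A minimal sketch of pulling rows out of the fetched page, assuming the data
# sits in an HTML table; the tag structure is an assumption about the target
# page, not taken from the original code.
for tr in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    if cells:
        list_temp.append(cells)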

print('Deleting the special last two entries')
print(list1)
list.append(list1)
# list1.clear()

# print(list)
list_data = []

urls = url_more()  # returns the URL of every shop in the file, with the max page size of 100; urls is a list
# print(urls)
teshu_url = []
# 'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' is 食其家 (Sukiya)
special_url = ['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2', 'shop=%25E7%25BC%2598%25E5%25AE%25B6', 'shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2', 'shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1', 'shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B']
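
# The shop names above are percent-encoded twice ('%25' is itself the encoding
# of '%'), so recovering a name takes two unquote passes; a quick check of the
# first entry:
from urllib.parse import unquote
print(unquote(unquote('%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2')))  # -> 人生一串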

for url in urls:  # iterate over every shop URL
    begin = url.find('&') + 1
    end = url.rfind('&')
    # print(begin, end)
    # special URLs get special handling
    if url[begin:end] in special_url:
        print('Found a special one!')
        already_spider_shopnum += 1  # add 1 to the number of crawled shops
        teshu_url.append(url)
        # remove one more list element
        url_teshu = url_add_time(url, date_begin, date_end)  # append the date range to the URL
        list_to_MySql = get_MorePages_teshu(url_teshu, page)  # shop URL and number of pages to crawl
        # a = remove_Duplicates_list(list_to_MySql)
        # print('\n')
        # for item in a:
        #     print(item)

        if len(list_to_MySql) == 0:
            print('This shop has no data in the time range, skipping it')
            continue
        has_remove_duplicates = remove_Duplicates_list(list_to_MySql)  # drop every duplicate row crawled for this shop
        for item in has_remove_duplicates:
            all_data.append(item)
        # is_minutes_exceed_30(has_remove_duplicates, con)  # write the crawled device-fault data into the exception table
        # isExceeding(has_remove_duplicates, con)  # write the crawled data into the over-limit table
        # ea_t_dev(has_remove_duplicates, con)  # write the crawled data into the device info table
        # fd_t_minbute(has_remove_duplicates, con)  # write the crawled data into the minute-data table
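        # remove_Duplicates_list is presumably an order-preserving dedupe over
        # the crawled rows; a minimal sketch of the idea (an assumption, not the
        # author's definition):
        #     seen, unique = set(), []
        #     for row in rows:
        #         if tuple(row) not in seen:
        #             seen.add(tuple(row))
        #             unique.append(row)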


        list_to_MySql.clear()
    if url[begin:end] == 'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6':  # 食其家 (Sukiya)
        print('Found a special one!')

        has_remove_duplicates = remove_Duplicates_list(list_to_MySql)  # drop every duplicate row crawled for this shop
        for item in has_remove_duplicates:
            all_data.append(item)
        # is_minutes_exceed_30(has_remove_duplicates, con)  # write the crawled device-fault data into the exception table
        # isExceeding(has_remove_duplicates, con)  # write the crawled data into the over-limit table
        # ea_t_dev(has_remove_duplicates, con)  # write the crawled data into the device info table
        # fd_t_minbute(has_remove_duplicates, con)  # write the crawled data into the minute-data table


        list_to_MySql.clear()
for t in teshu_url:  # remove the special URLs from urls
    urls.remove(t)

Key_period_night_begin = datetime.strptime('17:00', "%H:%M")  # evening key period
Key_period_night_end = datetime.strptime('21:00', "%H:%M")
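
# A minimal sketch of using these bounds to flag a record in the evening key
# period; the helper name is ours, and taking the record time as an 'HH:MM'
# string is an assumption about the crawled data.
def in_night_key_period(hhmm):
    t = datetime.strptime(hhmm, "%H:%M")
    return Key_period_night_begin <= t <= Key_period_night_end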

engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
# engine = create_engine("mysql+pymysql://fume:fume_feiyu2023@localhost:3306/fume?charset=utf8")
engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")  # overrides the local engine created above

# dedicated connection for reading the device info table
con_read = engine.connect()

# initialize the session
session = ses
webshops = shops[:]
print(f'Fetching data for the time range: {beginTime}-{endTime}')
print('Shop names passed in:', shops)

engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")  # overrides the local engine created above
con = engine.connect()

# back_cookie()  # read the cookie from a file
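
# A minimal sketch of what back_cookie might do, assuming the captured cookie
# string was saved to a local file (the name cookie.txt is an assumption); the
# value itself still has to be captured by hand, as the comments above note.
def back_cookie():
    global ck
    with open('cookie.txt', 'r', encoding='utf8') as f:
        ck = f.read().strip()
    session.headers.update({'Cookie': ck})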