src/Crawling.py
@@ -120,7 +120,8 @@
        count_all=count_all+get_OnePage(i,count)
        if len(list_temp)==0:        # an empty page means no later pages have data; exit the loop
            print('Subsequent pages are empty; crawling the next shop')
            break                    # exit the loop
        list_all.extend(list_temp)   # append this page's data to list_all
        print("爬取了第",page,"页")
        page=page+1
@@ -213,7 +214,7 @@
    data = pd.DataFrame(list,columns=['dev_id','exception','exception_type','region','begin_time','end_time'])
    print("\n\n")
    print(data)
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
    # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
    # con = engine.connect()
    # test3 is the table to write into; with this approach the table must already exist in the database
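    # The write presumably happens via pandas to_sql; a minimal sketch (the table
    # name 'test3' comes from the note above, the flags are assumptions):
    #   data.to_sql('test3', con=con, if_exists='append', index=False)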
@@ -380,7 +381,7 @@
    data = pd.DataFrame(list,columns=['MV_Stat_Code','MV_Create_Time','MV_Data_Time','MV_Fan_Electricity','MV_Purifier_Electricity','MV_Fume_Concentration','MV_Fume_Concentration2'])
    print("\n\n")
    print(data)
    #engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
    #engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
    #con = engine.connect()
    # test3 is the table to write into; with this approach the table must already exist in the database
@@ -512,7 +513,7 @@
def dev_info_data_if_exisitd(list,con):  # list holds one shop's crawled pages after conversion
    global con_read
    # create a second database connection
    # engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
    # engine = create_engine("mysql+pymysql://root:1234@localhost:3306/fume?charset=utf8")
    # con_read = engine.connect()
    df = pd.read_sql('SELECT DI_Code,DI_Name,DI_Supplier FROM ea_t_device_info',con=con_read)   # read device code, shop name, and supplier from the device info table; returns a DataFrame
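    # A plausible continuation (hypothetical row layout, not the author's confirmed
    # logic): keep only rows whose device code is not already in ea_t_device_info.
    #   existing = set(df['DI_Code'])
    #   new_rows = [row for row in list if row[0] not in existing]   # row[0] assumed to be the device code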
@@ -604,11 +605,7 @@
    global ck
    global list_temp    # use the global variable
    list_temp.clear()  # clear the temporary list
    # session.headers = {
    #     # note: the Cookie must be captured manually
    #     # "Cookie":ck,
    #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    # }
    r = session.get(url, verify=False).text
    soup = bs(r,'html.parser')
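    # Typical extraction from the parsed page; the tag names below are assumptions
    # for illustration, not taken from the real page structure:
    #   for tr in soup.find_all('tr'):
    #       cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    #       if cells:
    #           list_temp.append(cells)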
@@ -686,11 +683,7 @@
    global list_temp    # use the global variable
    list_temp.clear()  # clear the temporary list
    # session.headers = {
    #     # note: the Cookie must be captured manually
    #     # "Cookie":ck,
    #     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    # }
    r = session.get(url, verify=False).text
    soup = bs(r,'html.parser')
@@ -707,7 +700,6 @@
        print('Removed the special last two entries')
        print(list1)
        list.append(list1) 
        #list1.clear()
    #print(list) 
    list_data=[]  
@@ -753,36 +745,27 @@
    urls=url_more()   # returns every shop url from the file, with max display count 100; urls is a list
    #print(urls)
    teshu_url=[]
    # 'shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6' is the shop 食其家
    special_url=['shop=%25E4%25BA%25BA%25E7%2594%259F%25E4%25B8%2580%25E4%25B8%25B2','shop=%25E7%25BC%2598%25E5%25AE%25B6','shop=%25E4%25B8%25B0%25E8%258C%2582%25E7%2583%25A4%25E4%25B8%25B2','shop=%25E6%25B3%25B0%25E7%2585%258C%25E9%25B8%25A1','shop=%25E5%25B0%258F%25E9%2593%2581%25E5%2590%259B']
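    # The entries above are double percent-encoded shop names ('%25' decodes to '%').
    # Quick check, for illustration only:
    #   from urllib.parse import unquote
    #   unquote(unquote('%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6'))   # -> '食其家'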
    for url in urls:          # iterate over every shop url
        begin=url.find('&')+1
        end=url.rfind('&')
        #print(begin,end)
        # detect the special urls and handle them separately
        if url[begin:end] in special_url:   
            print('Found a special one!')
            already_spider_shopnum += 1   # increment the count of crawled shops
            teshu_url.append(url)
            # then remove one more list element
            url_teshu=url_add_time(url,date_begin,date_end)  # add the date range to the url
            list_to_MySql=get_MorePages_teshu(url_teshu,page)    # shop url and number of pages to crawl
            # a=remove_Duplicates_list(list_to_MySql)
            # print('\n')
            # for item in a:
            #     print(item)
            if len(list_to_MySql) == 0:
                print('No data for this shop in the given time range; skipping')
                continue
            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   # drop all duplicate rows from this shop's crawled pages
            for item in has_remove_duplicates:
                all_data.append(item)
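            # remove_Duplicates_list above is assumed to drop repeated rows while
            # preserving order; a minimal sketch of that behavior (not the actual
            # implementation, which is defined elsewhere in this file):
            #   def remove_Duplicates_list(rows):
            #       seen, out = set(), []
            #       for r in rows:
            #           key = tuple(r)
            #           if key not in seen:
            #               seen.add(key)
            #               out.append(r)
            #       return out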
            # is_minutes_exceed_30(has_remove_duplicates,con)      # write the device-fault data for these pages into the exception table
            # isExceeding(has_remove_duplicates,con)               # write these pages' data into the over-limit table
            # ea_t_dev(has_remove_duplicates,con)                  # write these pages' data into the device info table
            # fd_t_minbute(has_remove_duplicates,con)              # write these pages' data into the minute-data table
            list_to_MySql.clear()
        if url[begin:end]=='shop=%25E9%25A3%259F%25E5%2585%25B6%25E5%25AE%25B6':   #食其家
            print('Found a special one!')
@@ -800,11 +783,7 @@
            has_remove_duplicates = remove_Duplicates_list(list_to_MySql)   # drop all duplicate rows from this shop's crawled pages
            for item in has_remove_duplicates:
                all_data.append(item)
            # is_minutes_exceed_30(has_remove_duplicates,con)      # write the device-fault data for these pages into the exception table
            # isExceeding(has_remove_duplicates,con)               # write these pages' data into the over-limit table
            # ea_t_dev(has_remove_duplicates,con)                  # write these pages' data into the device info table
            # fd_t_minbute(has_remove_duplicates,con)              # write these pages' data into the minute-data table
            list_to_MySql.clear()
    for t in teshu_url:     # remove the special urls from urls
        urls.remove(t)
@@ -874,7 +853,12 @@
Key_period_night_begin = datetime.strptime('17:00',"%H:%M")   # evening key period
Key_period_night_end = datetime.strptime('21:00',"%H:%M")
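# strptime with "%H:%M" yields datetimes on the default date (1900-01-01), so any
# comparison against a record's timestamp should use the time-of-day only.
# A sketch (row_time is a hypothetical timestamp string):
#   t = datetime.strptime(row_time, "%Y-%m-%d %H:%M:%S")
#   in_night_peak = Key_period_night_begin.time() <= t.time() <= Key_period_night_end.time()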
engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
# e1ngine = create_engine("mysql+pymysql://fume:fume_feiyu2023@localhost:3306/fume?charset=utf8")
engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
# e1ngine = create_engine("mysql+pymysql:/fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
# dedicated connection for reading the device info table
con_read = engine.connect()
@@ -903,9 +887,10 @@
    # initialize the session
    session = ses
    webshops = shops[:]
    print(f'Time range for data collection: {beginTime}-{endTime}')
    print('Shops passed in:', shops)
   
    engine = create_engine("mysql+mysqlconnector://root:1234@localhost:3306/fume?charset=utf8")
    engine = create_engine("mysql+pymysql://fumeRemote:feiyu2023@114.215.109.124:3306/fume?charset=utf8")
    con = engine.connect()
    # back_cookie()   # read the cookie from a file