Guidechem Chemicals, Part 4: Multithreading + Remainder Pages

 

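This fourth installment of the Guidechem scraper splits one region's supplier-list pages across several download threads, then collects the leftover pages that do not fill a whole batch in the main thread. Pages that fail to download are recorded in bad_urls for a later retry.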

# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016
Collected files smaller than 2 KB indicate a failed download

@author: Administrator
"""

  
import requests,bs4,csv,time,random,os,threading
#how many pages each thread batch collects
divident=20
#combined output file (unused below: Write_table_to_csv builds its own fileName)
fileName='combinedFile.csv'
#second-level URLs whose download failed
bad_urls=[]

site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
site_guangdong="http://china.guidechem.com/suppliers/list_catid-21_area-广东"
site_shanghai="http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
site_shanxi="http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
site_chongqing="http://china.guidechem.com/suppliers/list_catid-21_area-重庆"
site_jiangsu="http://china.guidechem.com/suppliers/list_catid-21_area-江苏"

pages_hubei=31
pages_guangdong=21
pages_shanghai=34
pages_shanxi=15
pages_chongqing=2
pages_jiangsu=67
start_page=0

#build the list of page URLs for one region
def Get_sites(site,pages):
    list_pages=[]
    for page in range(1,pages+1):
        thePage=site+"-"+"p"+str(page)+".html"
        list_pages.append(thePage)
    return list_pages

def Get_company_name(elems,i):
    elems_company_name=elems[i].select(".dblue")
    #if the element is missing, return an empty string
    if len(elems_company_name)==0:
        company_name=""
        return company_name
    company_name=elems_company_name[0].text
    return company_name

def Get_main_product(elems,i):
    elems_main_product=elems[i].select("li")
    #if the element is missing, return an empty string
    if len(elems_main_product)==0:
        main_product=""
        return main_product
    main_product=elems_main_product[1].text.strip("\r\n")
    return main_product

def Get_phone_address(elems,i):
    elems_contact=elems[i].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("\r\n\r\n\t\r\n")
    content_contact2=content_contact1.strip("\r\n")
    list_content_contact=content_contact2.split("\r\n\r\n")
    #the info is sometimes incomplete, so branch on how many fields came back;
    #default both to "" so neither is ever undefined
    phone=""
    address=""
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:
            address=content
        if "电话" in content:
            phone=content
    phone_address=(phone,address)
    return phone_address

#grab the 20 company records on one page and store them in list_rows_information
def Get_page_information(url):
    list_rows_information=[]
    res=requests.get(url)
    #time.sleep(2)
    soup=bs4.BeautifulSoup(res.text,"lxml")
    #time.sleep(2)
    #the combined-info block for each company
    elems=soup.select(".clist_list_content_r")
    num=len(elems)
    for i in range(num):
        try:
            #company name
            company_name=Get_company_name(elems,i)
            #main products
            main_product=Get_main_product(elems,i)
            #contact info
            phone_address=Get_phone_address(elems,i)
            phone=phone_address[0]
            address=phone_address[1]
            list_rows_information.append([company_name,main_product,phone,address])
        except:
            print("error at:",i)
            continue
    return list_rows_information

#write one page's rows to a csv file; list_tableContent is a 2-D list [[a],[b],[c]]
def Write_table_to_csv(url):
    list_tableContent=Get_page_information(url)
    fileName=os.path.splitext(url)[0][-3:]+".csv"   # e.g. p12.csv
    '''
    fileName=os.path.splitext(url)[0][-3:]+".csv"
    fileName
    Out[27]: 'p12.csv'
    '''
    #open with newline='' so csv.writer does not write blank rows between lines
    file=open(fileName,'w',newline='')
    writer1=csv.writer(file)
    writer1.writerows(list_tableContent)
    file.close()

#single-threaded fallback: write every page's csv one after another
def Write_allTables_to_csvs(list_pages):
    for i in range(start_page,len(list_pages)):
        try:
            Write_table_to_csv(list_pages[i])
            time.sleep(random.randint(30,31))
        except:
            print("error at:",i)
            continue

#step is how many full thread batches to run; divident is how many pages per batch
def Step(urls_list,divident):
    step=len(urls_list)/divident
    step=int(step)
    return step

#pages left over after the full batches
def Left(urls_list):
    step=Step(urls_list,divident)
    left=len(urls_list)-step*divident
    return left

#collect company data for one slice of the URL list
def download_range(start,end):
    urls_list_range1=list_pages[start:end]
    for url in urls_list_range1:
        try:
            Write_table_to_csv(url)
        except:
            bad_urls.append(url)
            continue
    #print("well Done")
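Before the main block, the Step/Left arithmetic is worth checking by hand. For the Jiangsu list of 67 pages with divident=20 pages per batch, Step returns int(67/20)=3 full batches and Left returns 67-3*20=7 leftover pages. Below is a minimal standalone sanity check of that bookkeeping (the demo_* names are illustrative, not part of the scraper; a 67-entry list stands in for the real URLs):

#standalone sanity check of the Step/Left arithmetic (demo names, not the scraper's)
demo_pages=list(range(67))                 #stands in for the 67 Jiangsu page URLs
demo_divident=20                           #pages per thread batch
demo_step=int(len(demo_pages)/demo_divident)        #3 full batches
demo_left=len(demo_pages)-demo_step*demo_divident   #7 leftover pages

chunks=[demo_pages[i:i+demo_divident]
        for i in range(0, demo_step*demo_divident, demo_divident)]
remainder=demo_pages[len(demo_pages)-demo_left:]    #the slice the main thread takes

covered=[p for chunk in chunks for p in chunk]+remainder
assert covered==demo_pages                 #every page exactly once, no overlap
print(demo_step, demo_left)                #-> 3 7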
#main program
list_pages=Get_sites(site_jiangsu,pages_jiangsu)
#how many full thread batches to launch
step=Step(list_pages,divident)
#pages left over after the full batches
left=Left(list_pages)

#to generate all the csv files single-threaded instead:
#Write_allTables_to_csvs(list_pages)

downloadThreads = []    # a list of all the Thread objects
#launch step threads, each downloading a divident-page slice
for i in range(0, step*divident, divident):
    downloadThread = threading.Thread(target=download_range, args=(i, i+divident))
    downloadThreads.append(downloadThread)
    downloadThread.start()

#collect the remainder in the main thread; note that list_pages[-left:0] would be
#an empty slice, so slice from len-left to the end instead
if left>0:
    download_range(len(list_pages)-left, len(list_pages))

# Wait for all threads to end.
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')

'''
test snippets

#downloadThread = threading.Thread(target=download_range, args=(10, 12))
#downloadThread.start()

downloadThread = threading.Thread(target=download_range, args=(12, 14))
downloadThread.start()

downloadThread = threading.Thread(target=download_range, args=(14, 16))
downloadThread.start()

i=3
res=requests.get(list_pages[i])
soup=bs4.BeautifulSoup(res.text,"lxml")
elems=soup.select(".clist_list_content_r")

#contact info
elems_contact=elems[2].select(".site_l")
content_contact=elems_contact[0].text
content_contact1=content_contact.strip("\r\n\r\n\t\r\n")
content_contact2=content_contact1.strip("\r\n")
list_content_contact=content_contact2.split("\r\n\r\n")
#the info is sometimes incomplete, so branch on how many fields came back
if len(list_content_contact)==2:
    phone=list_content_contact[0]
    address=list_content_contact[1]
if len(list_content_contact)==1:
    content=list_content_contact[0]
    if "地址" in content:
        address=content
        phone=""
    if "电话" in content:
        phone=content
        address=""
'''
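For comparison, the standard library's ThreadPoolExecutor can take over the batching, which removes the step/left bookkeeping entirely. The sketch below is an alternative, not the code this post runs: it reuses Write_table_to_csv and list_pages from above, and the names download_one, pool_bad_urls, and pool_lock are illustrative. It also guards the shared failure list with an explicit lock (CPython's GIL makes list.append atomic, but the lock makes the intent clear):

#alternative sketch: let a thread pool do the batching, so no remainder math is needed
#(reuses Write_table_to_csv and list_pages from the script above)
from concurrent.futures import ThreadPoolExecutor

pool_bad_urls=[]
pool_lock=threading.Lock()       #guard the shared failure list explicitly

def download_one(url):
    try:
        Write_table_to_csv(url)
    except Exception:
        with pool_lock:
            pool_bad_urls.append(url)

with ThreadPoolExecutor(max_workers=3) as pool:   #3 workers, like the 3 full batches
    pool.map(download_one, list_pages)
print('Done.', len(pool_bad_urls), 'pages failed')

The pool hands each worker the next URL as soon as it finishes, so uneven page sizes no longer leave one thread running long after the others.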

 

 
