Guidechem Chemicals, Part 4: Multithreading + Remainder Pages

 

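This fourth installment of the Guidechem scraper splits one region's supplier-list pages across several download threads, then collects the leftover pages that do not fill a whole batch in the main thread. Pages that fail to download are recorded in bad_urls for a later retry.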

# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016
Collected files smaller than 2 KB indicate a failed download

@author: Administrator
"""

  
import requests,bs4,csv,time,random,os,threading
#how many pages each thread batch collects
divident=20
#combined output file (unused below: Write_table_to_csv builds its own fileName)
fileName='combinedFile.csv'
#second-level URLs whose download failed
bad_urls=[]

site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
site_guangdong="http://china.guidechem.com/suppliers/list_catid-21_area-广东"
site_shanghai="http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
site_shanxi="http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
site_chongqing="http://china.guidechem.com/suppliers/list_catid-21_area-重庆"
site_jiangsu="http://china.guidechem.com/suppliers/list_catid-21_area-江苏"

pages_hubei=31
pages_guangdong=21
pages_shanghai=34
pages_shanxi=15
pages_chongqing=2
pages_jiangsu=67
start_page=0

#build the list of page URLs for one region
def Get_sites(site,pages):
    list_pages=[]
    for page in range(1,pages+1):
        thePage=site+"-"+"p"+str(page)+".html"
        list_pages.append(thePage)
    return list_pages

def Get_company_name(elems,i):
    elems_company_name=elems[i].select(".dblue")
    #if the element is missing, return an empty string
    if len(elems_company_name)==0:
        company_name=""
        return company_name
    company_name=elems_company_name[0].text
    return company_name

def Get_main_product(elems,i):
    elems_main_product=elems[i].select("li")
    #if the element is missing, return an empty string
    if len(elems_main_product)==0:
        main_product=""
        return main_product
    main_product=elems_main_product[1].text.strip("\r\n")
    return main_product

def Get_phone_address(elems,i):
    elems_contact=elems[i].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("\r\n\r\n\t\r\n")
    content_contact2=content_contact1.strip("\r\n")
    list_content_contact=content_contact2.split("\r\n\r\n")
    #the info is sometimes incomplete, so branch on how many fields came back;
    #default both to "" so neither is ever undefined
    phone=""
    address=""
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:
            address=content
        if "电话" in content:
            phone=content
    phone_address=(phone,address)
    return phone_address

#grab the 20 company records on one page and store them in list_rows_information
def Get_page_information(url):
    list_rows_information=[]
    res=requests.get(url)
    #time.sleep(2)
    soup=bs4.BeautifulSoup(res.text,"lxml")
    #time.sleep(2)
    #the combined-info block for each company
    elems=soup.select(".clist_list_content_r")
    num=len(elems)
    for i in range(num):
        try:
            #company name
            company_name=Get_company_name(elems,i)
            #main products
            main_product=Get_main_product(elems,i)
            #contact info
            phone_address=Get_phone_address(elems,i)
            phone=phone_address[0]
            address=phone_address[1]
            list_rows_information.append([company_name,main_product,phone,address])
        except:
            print("error at:",i)
            continue
    return list_rows_information

#write one page's rows to a csv file; list_tableContent is a 2-D list [[a],[b],[c]]
def Write_table_to_csv(url):
    list_tableContent=Get_page_information(url)
    fileName=os.path.splitext(url)[0][-3:]+".csv"   # e.g. p12.csv
    '''
    fileName=os.path.splitext(url)[0][-3:]+".csv"
    fileName
    Out[27]: 'p12.csv'
    '''
    #open with newline='' so csv.writer does not write blank rows between lines
    file=open(fileName,'w',newline='')
    writer1=csv.writer(file)
    writer1.writerows(list_tableContent)
    file.close()

#single-threaded fallback: write every page's csv one after another
def Write_allTables_to_csvs(list_pages):
    for i in range(start_page,len(list_pages)):
        try:
            Write_table_to_csv(list_pages[i])
            time.sleep(random.randint(30,31))
        except:
            print("error at:",i)
            continue

#step is how many full thread batches to run; divident is how many pages per batch
def Step(urls_list,divident):
    step=len(urls_list)/divident
    step=int(step)
    return step

#pages left over after the full batches
def Left(urls_list):
    step=Step(urls_list,divident)
    left=len(urls_list)-step*divident
    return left

#collect company data for one slice of the URL list
def download_range(start,end):
    urls_list_range1=list_pages[start:end]
    for url in urls_list_range1:
        try:
            Write_table_to_csv(url)
        except:
            bad_urls.append(url)
            continue
    #print("well Done")
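Before the main block, the Step/Left arithmetic is worth checking by hand. For the Jiangsu list of 67 pages with divident=20 pages per batch, Step returns int(67/20)=3 full batches and Left returns 67-3*20=7 leftover pages. Below is a minimal standalone sanity check of that bookkeeping (the demo_* names are illustrative, not part of the scraper; a 67-entry list stands in for the real URLs):

#standalone sanity check of the Step/Left arithmetic (demo names, not the scraper's)
demo_pages=list(range(67))                 #stands in for the 67 Jiangsu page URLs
demo_divident=20                           #pages per thread batch
demo_step=int(len(demo_pages)/demo_divident)        #3 full batches
demo_left=len(demo_pages)-demo_step*demo_divident   #7 leftover pages

chunks=[demo_pages[i:i+demo_divident]
        for i in range(0, demo_step*demo_divident, demo_divident)]
remainder=demo_pages[len(demo_pages)-demo_left:]    #the slice the main thread takes

covered=[p for chunk in chunks for p in chunk]+remainder
assert covered==demo_pages                 #every page exactly once, no overlap
print(demo_step, demo_left)                #-> 3 7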
#main program
list_pages=Get_sites(site_jiangsu,pages_jiangsu)
#how many full thread batches to launch
step=Step(list_pages,divident)
#pages left over after the full batches
left=Left(list_pages)

#to generate all the csv files single-threaded instead:
#Write_allTables_to_csvs(list_pages)

downloadThreads = []    # a list of all the Thread objects
#launch step threads, each downloading a divident-page slice
for i in range(0, step*divident, divident):
    downloadThread = threading.Thread(target=download_range, args=(i, i+divident))
    downloadThreads.append(downloadThread)
    downloadThread.start()

#collect the remainder in the main thread; note that list_pages[-left:0] would be
#an empty slice, so slice from len-left to the end instead
if left>0:
    download_range(len(list_pages)-left, len(list_pages))

# Wait for all threads to end.
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')

'''
test snippets

#downloadThread = threading.Thread(target=download_range, args=(10, 12))
#downloadThread.start()

downloadThread = threading.Thread(target=download_range, args=(12, 14))
downloadThread.start()

downloadThread = threading.Thread(target=download_range, args=(14, 16))
downloadThread.start()

i=3
res=requests.get(list_pages[i])
soup=bs4.BeautifulSoup(res.text,"lxml")
elems=soup.select(".clist_list_content_r")

#contact info
elems_contact=elems[2].select(".site_l")
content_contact=elems_contact[0].text
content_contact1=content_contact.strip("\r\n\r\n\t\r\n")
content_contact2=content_contact1.strip("\r\n")
list_content_contact=content_contact2.split("\r\n\r\n")
#the info is sometimes incomplete, so branch on how many fields came back
if len(list_content_contact)==2:
    phone=list_content_contact[0]
    address=list_content_contact[1]
if len(list_content_contact)==1:
    content=list_content_contact[0]
    if "地址" in content:
        address=content
        phone=""
    if "电话" in content:
        phone=content
        address=""
'''
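For comparison, the standard library's ThreadPoolExecutor can take over the batching, which removes the step/left bookkeeping entirely. The sketch below is an alternative, not the code this post runs: it reuses Write_table_to_csv and list_pages from above, and the names download_one, pool_bad_urls, and pool_lock are illustrative. It also guards the shared failure list with an explicit lock (CPython's GIL makes list.append atomic, but the lock makes the intent clear):

#alternative sketch: let a thread pool do the batching, so no remainder math is needed
#(reuses Write_table_to_csv and list_pages from the script above)
from concurrent.futures import ThreadPoolExecutor

pool_bad_urls=[]
pool_lock=threading.Lock()       #guard the shared failure list explicitly

def download_one(url):
    try:
        Write_table_to_csv(url)
    except Exception:
        with pool_lock:
            pool_bad_urls.append(url)

with ThreadPoolExecutor(max_workers=3) as pool:   #3 workers, like the 3 full batches
    pool.map(download_one, list_pages)
print('Done.', len(pool_bad_urls), 'pages failed')

The pool hands each worker the next URL as soon as it finishes, so uneven page sizes no longer leave one thread running long after the others.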

 

 
