Scraping a Company Directory with Python

Scrape the company directory from 云商网 (ynshangji.com), including company names, contact details, and main business lines.

Without further ado, here is the code; a few caveats are noted at the end:

import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import chardet
import time
import os
import random
import threading
from io import StringIO

import pandas as pd

host_ = "http://3g.ynshangji.com"
cat_url = "/shen-huangye/"

num_threads = 8      # number of worker threads
task_work_num = 10   # listing pages scraped by each thread


# Detect the page's real charset and re-decode the response in place.
# The page is fetched a second time with urllib so chardet can inspect
# the raw bytes, then requests is told which encoding to use.
def reencoding(req, url):
    try:
        with urllib.request.urlopen(url) as response:
            raw_data = response.read()
            req.encoding = chardet.detect(raw_data)['encoding']
    except urllib.error.HTTPError as e:
        print(e.code)
    except urllib.error.URLError as e:
        print(e.reason)
    return req.text

# Convert the <table> elements from a company detail page into one DataFrame.
#   tables         -- list of <table> tags scraped from the page
#   company_prefix -- label string such as "公司名称:"
#   comp_name      -- bs4.element.Tag holding the company name
def html2pd(tables, company_prefix=None, comp_name=None):
    df_list = []

    for table in tables:
        # read_html returns a list of DataFrames; each <table> yields one
        parsed = pd.read_html(StringIO(table.prettify()))
        # prepend a row carrying the company name
        name_row = pd.DataFrame([(company_prefix, comp_name.string)])
        df_list.append(pd.concat([name_row, parsed[0]], axis=0))

    if len(df_list) == 0:
        return None
    return pd.concat(df_list)



# Walk the company list (<ul>) on one category page and append every
# company's detail tables to the Excel writer; returns the next free row.
def find_info_from_html_and2excel(ul, writer, index):
    current_index_row = index
    get_index = 0
    company_prefix = "公司名称:"
    for companyname in ul.find_all('li'):
        comp_name = companyname.find('div')  # tag holding the company name
        sub_url = ''.join([a['href'] for a in companyname.find_all('a')])
        new_url = host_ + sub_url  # URL of the company detail page
        sub_url_req = requests.post(new_url)
        if sub_url_req.status_code == requests.codes.ok:
            req_text = reencoding(sub_url_req, new_url)
            company_detail_soup = BeautifulSoup(req_text, 'html.parser')
            tables = company_detail_soup.select('table')
            df = html2pd(tables, company_prefix, comp_name)
            if df is None:
                continue  # no tables on this page; worth logging for a retry
            df.to_excel(writer, startrow=current_index_row,
                        sheet_name='aa', index=False)
            current_index_row += df.shape[0] + 1  # skip past the rows just written
            get_index = current_index_row
    return get_index



# One worker: scrape task_work_num consecutive listing pages and
# write the results into its own xlsx file.
def total(task_num):
    total_index_num = int(task_num)
    begin_index = (total_index_num - 1) * task_work_num + 1
    end_index = total_index_num * task_work_num
    xlsx_name = 'company_total' + '_' + str(total_index_num) + '.xlsx'
    writer = pd.ExcelWriter(os.path.join(os.getcwd(), xlsx_name))

    current_index_row = 0
    for index in range(begin_index, end_index + 1):
        url = host_ + cat_url + str(index)
        info = requests.post(url)
        time.sleep(random.randint(3, 6))
        if info.status_code == requests.codes.ok:
            soup = BeautifulSoup(info.content, 'html.parser')
            ul = soup.find('ul', attrs={'class': 'indSubCateUL'})
            next_row = find_info_from_html_and2excel(ul, writer, current_index_row)
            print(url + ' fetched OK; next startrow: ' + str(next_row))
            time.sleep(random.randint(5, 10))
            if next_row:  # 0 means nothing was written -- keep the old offset
                current_index_row = next_row
    writer.close()  # close() persists the file; a separate save() is redundant



# Launch the workers; each thread takes its task number as a string.
threads = []
for i in range(1, num_threads + 1):
    t = threading.Thread(target=total, name='t_' + str(i), args=(str(i),))
    t.start()
    threads.append(t)
for t in threads:
    t.join()  # wait for every worker to finish
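
The thread loop above can also be written with concurrent.futures, which handles starting and joining the workers for you. A minimal sketch, reusing the same total() worker and num_threads from above:

from concurrent.futures import ThreadPoolExecutor

# equivalent to starting num_threads threads and joining them;
# the with-block waits until every worker has finished
with ThreadPoolExecutor(max_workers=num_threads) as pool:
    pool.map(total, [str(i) for i in range(1, num_threads + 1)])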
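
One more side note: reencoding() downloads every detail page a second time just to sniff the charset. requests bundles the same detection and exposes it as apparent_encoding, so a one-download alternative could look like this (a sketch; reencode_inplace is a hypothetical helper name):

# set the encoding from detection run on the bytes already downloaded
def reencode_inplace(resp):
    resp.encoding = resp.apparent_encoding
    return resp.text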

Notes: 1. The scraped data ends up in 8 separate xlsx files, one per thread (a sketch for merging them follows after these notes).

2. While scraping, the site throttles the IP. Single-threaded, one run fetches roughly 20-36 listing pages before being blocked; with 8 threads, each thread manages about 10-15 pages. The only workarounds I found were restarting the router (to get a fresh IP) or routing requests through a proxy; simply adding delays did not seem to work in my tests.
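
For the proxy route, requests accepts a proxies dict on every call. A minimal sketch; the address below is a placeholder you would replace with a working proxy:

# hypothetical proxy endpoint -- substitute a real one
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
info = requests.post(url, proxies=proxies, timeout=10)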
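
And since the results land in eight files, they can be merged afterwards. A sketch assuming the company_total_N.xlsx naming and the 'aa' sheet used above:

import glob
import pandas as pd

# read every per-thread file raw (the sheets mix header and data rows)
frames = [pd.read_excel(f, sheet_name='aa', header=None)
          for f in sorted(glob.glob('company_total_*.xlsx'))]
pd.concat(frames, ignore_index=True).to_excel(
    'company_all.xlsx', index=False, header=False)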
