获取云商网企业名录,包括公司名称、联系方式以及主营业务。
下面直接给出代码,运行中需要注意的问题在文末说明:
import os
import random
import threading
import time
import urllib
import urllib.error
import urllib.request

import chardet
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Base URL of the target site (mobile edition of ynshangji.com).
host_ ="http://3g.ynshangji.com"
# Category path; the listing page number is appended, e.g. /shen-huangye/3
cat_url="/shen-huangye/"
# Intended number of scraper threads (the launch code below hard-codes 8 — keep in sync).
num_threads=8
# Number of listing pages each worker thread processes.
task_work_num=10
def reencoding(req, url):
    """Re-decode a fetched response whose declared charset may be wrong.

    Runs chardet over the raw bytes already held by *req*, sets
    ``req.encoding`` to the detected charset, and returns the decoded text.

    :param req: a ``requests.Response`` already fetched from *url*
    :param url: the URL the response came from (kept for backward
        compatibility; the page is no longer re-fetched)
    :return: the response body decoded with the detected encoding
    """
    try:
        # BUG FIX: the original downloaded the same URL a second time with
        # urllib just to sniff the charset — doubling the request count and
        # hastening the IP ban the site imposes.  Detect from the bytes we
        # already have instead.
        detected = chardet.detect(req.content)
        if detected.get('encoding'):
            req.encoding = detected['encoding']
    except Exception as e:
        # Best effort: on any detection failure fall back to the encoding
        # requests guessed from the headers.
        print(e)
    return req.text
def html2pd(tables, pd, company_prefix=None, comp_name=None):
    """Convert the HTML <table> tags of one company detail page to a DataFrame.

    Each table is parsed with ``pd.read_html`` and a one-row label
    DataFrame ``(company_prefix, company name)`` is prepended, so each
    company's block is identifiable in the final spreadsheet.

    :param tables: iterable of bs4 Tag objects (anything with ``prettify()``)
    :param pd: the pandas module (passed in by the original design; kept
        for backward compatibility with existing callers)
    :param company_prefix: label placed before the company name, e.g. "公司名称:"
    :param comp_name: tag whose ``.string`` is the company name (may be None)
    :return: the concatenated DataFrame, or None when *tables* is empty
    """
    # BUG FIX: comp_name may be None when the listing markup is unexpected;
    # the original raised AttributeError on comp_name.string in that case.
    name = comp_name.string if comp_name is not None else None
    frames = []
    for table in tables:
        parsed = pd.read_html(table.prettify())
        # One-row label frame prepended above the company's data table.
        header_row = pd.DataFrame([(company_prefix, name)])
        frames.append(pd.concat([header_row, parsed[0]], axis=0))
    if not frames:
        return None  # callers test `df is None`
    return pd.concat(frames)
def find_info_from_html_and2excel(ul, writer, index):
    """Visit every company in one listing <ul>, scrape each detail page,
    and append its tables to *writer*'s 'aa' sheet.

    :param ul: bs4 Tag of the listing <ul> (one <li> per company)
    :param writer: an open pandas ExcelWriter
    :param index: sheet row at which to start writing
    :return: the next free row in the sheet
    """
    current_index_row = index
    # BUG FIX: the original initialised this to 0, so a page where every
    # company request failed (or yielded no tables) returned 0 and the
    # caller reset its row cursor — later pages then overwrote earlier
    # data.  Start from the row we were given instead.
    get_index = index
    company_prefix = "公司名称:"
    for companyname in ul.find_all('li'):
        comp_name = companyname.find('div')  # tag holding the company name
        # Build the absolute URL of the company's detail page.
        sub_url = ''.join([a['href'] for a in companyname.find_all('a')])
        new_url = host_ + sub_url
        # NOTE(review): the whole script fetches pages with POST; GET is
        # the conventional verb — kept as-is to preserve behavior.
        sub_url_req = requests.post(new_url)
        if sub_url_req.status_code != requests.codes.ok:
            continue
        req_text = reencoding(sub_url_req, new_url)
        comany_detail_soup = BeautifulSoup(req_text, 'html.parser')
        tables = comany_detail_soup.select('table')
        df = html2pd(tables, pd, company_prefix, comp_name)
        if df is None:
            # No tables on this detail page; skip it.
            continue
        df.to_excel(writer, startrow=current_index_row,
                    sheet_name='aa', index=False)
        # Advance past the rows just written, +1 for the header row
        # that to_excel emits.
        current_index_row += df.shape[0] + 1
        get_index = current_index_row
    return get_index
def total(index_num):
    """Worker entry point: scrape this worker's slice of listing pages and
    save everything to its own xlsx file.

    Worker *k* handles pages (k-1)*task_work_num+1 .. k*task_work_num and
    writes company_total_k.xlsx in the current directory.

    :param index_num: worker id (string or int, convertible with int())
    """
    total_index_num = int(index_num)
    begin_index = (total_index_num - 1) * task_work_num + 1
    end_index = total_index_num * task_work_num
    xlsx_name = 'company_total' + '_' + str(total_index_num) + '.xlsx'
    writer = pd.ExcelWriter(os.path.join(os.getcwd(), xlsx_name))
    current_index_row = 0
    try:
        for index in range(begin_index, end_index + 1):
            url = host_ + cat_url + str(index)
            info = requests.post(url)
            time.sleep(random.randint(3, 6))  # throttle to dodge the IP ban
            if info.status_code != requests.codes.ok:
                continue
            soup = BeautifulSoup(info.content, 'html.parser')
            ul = soup.find('ul', attrs={'class': 'indSubCateUL'})
            if ul is None:
                # BUG FIX: the original passed None straight into
                # find_info_from_html_and2excel, which crashed on
                # ul.find_all when the listing markup was missing.
                continue
            index_num = find_info_from_html_and2excel(ul, writer, current_index_row)
            print(url + ' 访问成功;and: ' + str(index_num))
            time.sleep(random.randint(5, 10))
            # (the callee always returns an int, so no None check needed)
            current_index_row = index_num
    finally:
        # writer.save() was deprecated and then removed in modern pandas;
        # close() both saves and releases the file, and the try/finally
        # guarantees it runs even if a request raises.
        writer.close()
# Launch one worker thread per slice of task_work_num listing pages.
# The original spelled out eight identical Thread(...)/start() pairs and
# never used the num_threads constant; a loop keeps them in sync.
# NOTE: args must be a tuple — the original args='1' only worked because
# a one-character string happens to be an iterable of length one.
worker_threads = []
for worker_id in range(1, num_threads + 1):
    t = threading.Thread(target=total,
                         name='t_' + str(worker_id),
                         args=(str(worker_id),))
    t.start()
    worker_threads.append(t)
注意:1. 本文获取到的数据保存在 8 个 xlsx 文件中(与线程数一致,每个线程写各自的文件)。
2. 在抓取过程中会遇到 IP 被限制的问题:单线程下运行一次代码约可获取 20-36 个页面,8 线程下每个线程约可获取 10-15 个页面。IP 限制只能通过重启路由器或使用代理来解决;实测仅靠增加请求延迟无法绕过。