获取云商网企业名录,包括公司名称、联系方式以及主营业务。
下面直接给出代码,运行中需要注意的问题在文末说明:
import os
import random
import threading
import time
import urllib
import urllib.error
import urllib.request

import chardet
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Base URL of the target site (mobile edition of ynshangji.com).
host_ ="http://3g.ynshangji.com"
# Category path; the listing page number is appended, e.g. /shen-huangye/3
cat_url="/shen-huangye/"
# Intended number of scraper threads (the launch code below hard-codes 8 — keep in sync).
num_threads=8
# Number of listing pages each worker thread processes.
task_work_num=10
def reencoding(req, url):
    """Re-decode a fetched response whose declared charset may be wrong.

    Runs chardet over the raw bytes already held by *req*, sets
    ``req.encoding`` to the detected charset, and returns the decoded text.

    :param req: a ``requests.Response`` already fetched from *url*
    :param url: the URL the response came from (kept for backward
        compatibility; the page is no longer re-fetched)
    :return: the response body decoded with the detected encoding
    """
    try:
        # BUG FIX: the original downloaded the same URL a second time with
        # urllib just to sniff the charset — doubling the request count and
        # hastening the IP ban the site imposes.  Detect from the bytes we
        # already have instead.
        detected = chardet.detect(req.content)
        if detected.get('encoding'):
            req.encoding = detected['encoding']
    except Exception as e:
        # Best effort: on any detection failure fall back to the encoding
        # requests guessed from the headers.
        print(e)
    return req.text
def html2pd(tables, pd, company_prefix=None, comp_name=None):
    """Convert the HTML <table> tags of one company detail page to a DataFrame.

    Each table is parsed with ``pd.read_html`` and a one-row label
    DataFrame ``(company_prefix, company name)`` is prepended, so each
    company's block is identifiable in the final spreadsheet.

    :param tables: iterable of bs4 Tag objects (anything with ``prettify()``)
    :param pd: the pandas module (passed in by the original design; kept
        for backward compatibility with existing callers)
    :param company_prefix: label placed before the company name, e.g. "公司名称:"
    :param comp_name: tag whose ``.string`` is the company name (may be None)
    :return: the concatenated DataFrame, or None when *tables* is empty
    """
    # BUG FIX: comp_name may be None when the listing markup is unexpected;
    # the original raised AttributeError on comp_name.string in that case.
    name = comp_name.string if comp_name is not None else None
    frames = []
    for table in tables:
        parsed = pd.read_html(table.prettify())
        # One-row label frame prepended above the company's data table.
        header_row = pd.DataFrame([(company_prefix, name)])
        frames.append(pd.concat([header_row, parsed[0]], axis=0))
    if not frames:
        return None  # callers test `df is None`
    return pd.concat(frames)
def find_info_from_html_and2excel(ul, writer, index):
    """Visit every company in one listing <ul>, scrape each detail page,
    and append its tables to *writer*'s 'aa' sheet.

    :param ul: bs4 Tag of the listing <ul> (one <li> per company)
    :param writer: an open pandas ExcelWriter
    :param index: sheet row at which to start writing
    :return: the next free row in the sheet
    """
    current_index_row = index
    # BUG FIX: the original initialised this to 0, so a page where every
    # company request failed (or yielded no tables) returned 0 and the
    # caller reset its row cursor — later pages then overwrote earlier
    # data.  Start from the row we were given instead.
    get_index = index
    company_prefix = "公司名称:"
    for companyname in ul.find_all('li'):
        comp_name = companyname.find('div')  # tag holding the company name
        # Build the absolute URL of the company's detail page.
        sub_url = ''.join([a['href'] for a in companyname.find_all('a')])
        new_url = host_ + sub_url
        # NOTE(review): the whole script fetches pages with POST; GET is
        # the conventional verb — kept as-is to preserve behavior.
        sub_url_req = requests.post(new_url)
        if sub_url_req.status_code != requests.codes.ok:
            continue
        req_text = reencoding(sub_url_req, new_url)
        comany_detail_soup = BeautifulSoup(req_text, 'html.parser')
        tables = comany_detail_soup.select('table')
        df = html2pd(tables, pd, company_prefix, comp_name)
        if df is None:
            # No tables on this detail page; skip it.
            continue
        df.to_excel(writer, startrow=current_index_row,
                    sheet_name='aa', index=False)
        # Advance past the rows just written, +1 for the header row
        # that to_excel emits.
        current_index_row += df.shape[0] + 1
        get_index = current_index_row
    return get_index
def total(index_num):
    """Worker entry point: scrape this worker's slice of listing pages and
    save everything to its own xlsx file.

    Worker *k* handles pages (k-1)*task_work_num+1 .. k*task_work_num and
    writes company_total_k.xlsx in the current directory.

    :param index_num: worker id (string or int, convertible with int())
    """
    total_index_num = int(index_num)
    begin_index = (total_index_num - 1) * task_work_num + 1
    end_index = total_index_num * task_work_num
    xlsx_name = 'company_total' + '_' + str(total_index_num) + '.xlsx'
    writer = pd.ExcelWriter(os.path.join(os.getcwd(), xlsx_name))
    current_index_row = 0
    try:
        for index in range(begin_index, end_index + 1):
            url = host_ + cat_url + str(index)
            info = requests.post(url)
            time.sleep(random.randint(3, 6))  # throttle to dodge the IP ban
            if info.status_code != requests.codes.ok:
                continue
            soup = BeautifulSoup(info.content, 'html.parser')
            ul = soup.find('ul', attrs={'class': 'indSubCateUL'})
            if ul is None:
                # BUG FIX: the original passed None straight into
                # find_info_from_html_and2excel, which crashed on
                # ul.find_all when the listing markup was missing.
                continue
            index_num = find_info_from_html_and2excel(ul, writer, current_index_row)
            print(url + ' 访问成功;and: ' + str(index_num))
            time.sleep(random.randint(5, 10))
            # (the callee always returns an int, so no None check needed)
            current_index_row = index_num
    finally:
        # writer.save() was deprecated and then removed in modern pandas;
        # close() both saves and releases the file, and the try/finally
        # guarantees it runs even if a request raises.
        writer.close()
# Launch one worker thread per slice of task_work_num listing pages.
# The original spelled out eight identical Thread(...)/start() pairs and
# never used the num_threads constant; a loop keeps them in sync.
# NOTE: args must be a tuple — the original args='1' only worked because
# a one-character string happens to be an iterable of length one.
worker_threads = []
for worker_id in range(1, num_threads + 1):
    t = threading.Thread(target=total,
                         name='t_' + str(worker_id),
                         args=(str(worker_id),))
    t.start()
    worker_threads.append(t)
注意:1. 本文获取到的数据保存在 8 个 xlsx 文件中(与线程数一致,每个线程写各自的文件)。
2. 在抓取过程中会遇到 IP 被限制的问题:单线程下运行一次代码约可获取 20-36 个页面,8 线程下每个线程约可获取 10-15 个页面。IP 限制只能通过重启路由器或使用代理来解决;实测仅靠增加请求延迟无法绕过。