Scraping company information from Tianyancha (天眼查) with Python
IP: uses Xiaoxiang proxy (小象代理) IPs
Company names are read from name.txt; the scraped results are written to data.csv
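For reference, name.txt is expected to hold one company name per line, and the script creates data.csv with the header row below (column order matches the dict built in thread_work):
companyName,exist,qccUrl,regStatus,estiblishTime,regLocation,province,companyOrgType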
import requests
from lxml import etree
import queue
import threading
from termcolor import cprint
import csv
import os
def new_session():
    session = requests.session()
    while True:
        try:
            # https://www.qcc.com/  https://www.tianyancha.com/
            session.get(url='https://www.tianyancha.com/', headers=headers, timeout=(2, 2))
            cprint('new session', 'yellow')
            return session
        except Exception:
            cprint('network error', 'yellow')
def get_html(session, url, flag):
    while True:
        try:
            res = session.get(url=url, headers=headers, timeout=(5, 5), proxies=proxies).content.decode('utf-8', errors='ignore')
            cprint(res, 'red')  # debug: dump the raw response body
            if flag in res:
                return session, res
            # flag not found in the page, so the session is probably blocked: start a fresh one
            session = new_session()
            cprint('error response', 'red')
        except Exception:
            cprint('network timeout', 'yellow')
def analyze_out(res, name, dic):
    # Parse the search-result page; only fill the outer fields when the first hit matches the name exactly
    tree = etree.HTML(res)
    cprint(tree.text, 'red')  # debug
    # dizhi = tree.xpath('/html/body/div/div[2]/div/div[2]/section/main/div[2]/div[2]/div[1]/div/div[2]/div[2]/div[1]/div[1]/a/@href')
    # cprint("dizhi" + dizhi, 'green')
    table = tree.xpath('//table[@class="ntable ntable-list"]/tr')
    if not table:
        return dic
    item = table[0]
    res_name = ''.join(item.xpath('.//span[@class="copy-title"]/a//text()'))
    if res_name != name:
        return dic
    dic['exist'] = '是'
    dic['qccUrl'] = (item.xpath('/html/body/div/div[2]/div/div[2]/section/main/div[2]/div[2]/div[1]/div/div[2]/div[2]/div[1]/div[1]/a/@href') + [''])[0]
    dic['regStatus'] = (item.xpath('.//span[@class="copy-title"]/span/text()') + [''])[0]
    dic['estiblishTime'] = (item.xpath('.//span[@class="f"]/span/text()') + [''] * 2)[1]
    dic['regLocation'] = ''.join(item.xpath('.//span[@class="copy-value address-map"]//text()'))
    return dic
def analyze_ins(res, dic):
    # Parse the company detail page for industry (所属行业) and region (所属地区)
    tree = etree.HTML(res)
    tds = tree.xpath('//div[@class="cominfo-normal"]/table/tr/td')
    i = 0
    for td in tds:
        if '所属行业' in td.xpath('./text()'):
            dic['companyOrgType'] = (tds[i + 1].xpath('./text()') + [''])[0].strip()
        if '所属地区' in td.xpath('./text()'):
            dic['province'] = (tds[i + 1].xpath('./text()') + [''])[0]
        i += 1
    return dic
def thread_work(num):
    global totalNum
    session = new_session()
    while True:
        try:
            name = nameQueue.get_nowait()
        except queue.Empty:
            break
        cprint(name, 'red')
        dic = {
            'companyName': name,
            'exist': '否',
            'qccUrl': '',
            'regStatus': '',
            'estiblishTime': '',
            'regLocation': '',
            'province': '',
            'companyOrgType': ''
        }
        url = f'https://www.tianyancha.com/search?key={name}'
        flag = f'<title>{name}_相关搜索结果-天眼查</title>'
        # url = f'https://www.qcc.com/search?key={name}'
        # flag = f'<title>{name}_相关搜索结果-天眼查</title>'
        cprint('fetching search page', 'red')
        session, res = get_html(session, url, flag)
        dic = analyze_out(res, name, dic)
        if dic['exist'] == '是' and dic['qccUrl']:
            url = dic['qccUrl']
            flag = f'<title>{name} -企查查</title>'
            session, res = get_html(session, url, flag)
            dic = analyze_ins(res, dic)
        with writeLock:  # one shared lock so CSV rows from different threads do not interleave
            writer.writerow(list(dic.values()))
            totalNum += 1
            cprint(f'thread: {num} finishNum: {totalNum} data: {dic}', 'green')
if __name__ == '__main__':
    threadNum = 5
    totalNum = 0
    writeLock = threading.Lock()  # shared by all worker threads when writing CSV rows
    cprint(f'program starts with {threadNum} threads', 'green')
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    headers = {'User-Agent': ua}
    # 小象代理 (Xiaoxiang proxy)
    proxyMeta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
        'host': 'http-short.xiaoxiangdaili.com',
        'port': 10010,
        'user': '1006461268607258624',
        'pass': 'RMnMPVHr'
    }
    proxies = {
        'http': proxyMeta,
        'https': proxyMeta
    }
    name_list = [name.strip() for name in open(r'C:\Users\Xiao\Desktop\qicc_news-master\src\企业查询\name.txt', encoding='utf-8').readlines() if name.strip()]
    cprint(name_list, 'red')
    if not os.path.exists('data.csv'):
        open('data.csv', 'w', encoding='utf-8-sig').write(
            'companyName,exist,qccUrl,regStatus,estiblishTime,regLocation,province,companyOrgType\n')
    writer = csv.writer(open('data.csv', 'a', encoding='utf-8-sig', newline=''))
    nameQueue = queue.Queue()
    for name in name_list:
        nameQueue.put(name)
    threadList = []
    for n in range(threadNum):
        t = threading.Thread(target=thread_work, args=(n + 1,))
        threadList.append(t)
        t.start()
    for t in threadList:
        t.join()
    cprint('program finished', 'green')
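Below is a minimal, optional sketch for checking that the Xiaoxiang short-term proxy credentials still work before launching the full crawl. It is not part of the script above: httpbin.org is only an assumed test endpoint, and the host/port/user/pass are the same placeholders hard-coded in the main block.

# Sanity-check the proxy: if it works, the returned origin IP should differ from your own public IP.
import requests

proxyMeta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
    'host': 'http-short.xiaoxiangdaili.com',
    'port': 10010,
    'user': '1006461268607258624',
    'pass': 'RMnMPVHr'
}
proxies = {'http': proxyMeta, 'https': proxyMeta}

try:
    # httpbin.org/ip simply echoes the IP the request came from (assumed test endpoint)
    r = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=(5, 5))
    print(r.status_code, r.text)
except requests.RequestException as e:
    print('proxy check failed:', e)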