qcc信息爬虫获取(亲测有效)

废话不多说,直接上主题:
如果运行时遇到问题,多半出在公司名称(中文文字)与 URL 编码之间的转换上(代码中通过 quote() 处理)。

# Pool of browser User-Agent header values.
# Each entry is the bare header *value*: the stray "User-Agent:" header-name
# prefixes and trailing whitespace were removed so an entry can be passed
# directly as headers["user-agent"].
# NOTE(review): nothing in the visible code reads this list; presumably it is
# meant for randomising the request User-Agent - confirm against callers.
userAgent = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11Opera 11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
]

# Results already fetched by qcc(), keyed by the company value passed in.
# Each value is the list [master, synopsis, Credit_Code, business_scope].
all_company = {}
import ast
import random
import re
import time
from urllib.parse import quote
# NOTE(review): this block relies on the module-level names `requests`,
# `etree` (lxml) and `get_ip`, none of which are imported or defined in the
# visible part of the file - make sure they are in scope before calling qcc().

# Local file holding the textual form of the proxy list. It is parsed,
# stripped of its first entry and written back whenever a request fails.
_IP_LIST_PATH = './qcc_ip_list.txt'

_SEARCH_URL = 'https://www.qcc.com/search?'
_SITE_ROOT = 'https://www.qcc.com'

_ACCEPT = (
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
    'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
)

# Hard-coded session cookie captured from a browser. The single "{}" slot
# receives the current time in milliseconds.
# NOTE(review): presumably this session expires; replace with your own cookie.
_COOKIE_TEMPLATE = (
    'QCCSESSID=5ga2utmd682r6a6mki6v2i7p10; '
    'UM_distinctid=1753fc26d09412-0de8e6260c0ed5-333376b-1fa400-1753fc26d0a595; '
    'zg_did=%7B%22did%22%3A%20%221753fc26d33c07-0b79131f89a78f-333376b-1fa400-1753fc26d37aa2%22%7D; '
    '_uab_collina=160309250808754166880055; '
    'CNZZDATA1254842228=584715467-1603088268-https%253A%252F%252Fwww.baidu.com%252F%7C1603849732; '
    'acw_tc=6f7b369616038528233648657e7222d95506ccf03ab6e1c5f949c05f4d; '
    'zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201603852824351%2C%22updated%22%3A%20{}'
    '%2C%22info%22%3A%201603697354566%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20'
    '%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.qcc.com%22%2C'
    '%22cuid%22%3A%20%2201dbc455e009206a08ae9edf9ece6d95%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D'
)

# Extracts the company name from the anchor's onclick="addSearchIndex('<name>', ...)".
_SEARCH_NAME_RE = re.compile(r"addSearchIndex\('(.*?)'")

_ROWS_XPATH = '//table[@class="m_srchList"]//tr'
_ANCHOR_ONCLICK_XPATH = './/a[@class="ma_h1"]/@onclick'
_ANCHOR_HREF_XPATH = './/a[@class="ma_h1"]/@href'
_SYNOPSIS_XPATH = '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[3]/span[3]//text()'
_SYNOPSIS_FALLBACK_XPATH = '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[3]/span[2]/text()'
_CREDIT_CODE_XPATH = '//section[@id="Cominfo"]/table//tr[3]/td[2]//text()'
_BUSINESS_SCOPE_XPATH = '//section[@id="Cominfo"]/table//tr[9]/td[2]//text()'
_MASTER_XPATH = '//*[@id="Cominfo"]/table//tr[1]/td[2]/div/div/div[2]/a[1]/h2//text()'


def _discard_current_proxy(message, max_delay):
    """Print *message*, wait up to *max_delay* seconds, then drop the first proxy from the list file."""
    print(message)
    time.sleep(random.random() * max_delay)
    with open(_IP_LIST_PATH) as handle:
        # SECURITY: the original called eval() on the file contents, which runs
        # arbitrary code if the file is tampered with. literal_eval accepts the
        # same list literal without executing anything.
        proxies = ast.literal_eval(handle.read())
    with open(_IP_LIST_PATH, 'w') as handle:
        handle.write(str(proxies[1:]))


def _browser_headers(keyword, extra):
    """Build the browser-like request headers shared by both requests, plus *extra* entries."""
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': _COOKIE_TEMPLATE.format(int(time.time() * 1000)),
        'referer': 'https://www.qcc.com/search?key={}'.format(quote(keyword)),
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    }
    headers.update(extra)
    return headers


def _fetch_search_page(keyword):
    """Request the search result page for *keyword*, retrying until a 200 response is received."""
    headers = _browser_headers(keyword, {'accept': _ACCEPT})
    while True:
        time.sleep(random.random() * 10)
        try:
            response = requests.get(url=_SEARCH_URL, params={'key': keyword}, headers=headers,
                                    proxies=get_ip(), verify=True)
        except Exception as exc:  # retry boundary: any failure means "try the next proxy"
            print('错误:', exc)
            _discard_current_proxy('ip过期,更换!!!', 10)
            continue
        print('企查查的信息获取状态码:', response.status_code)
        if response.status_code == 200:
            return response
        _discard_current_proxy('ip过期,更换!!!', 10)


def _fetch_detail_page(detail_url, keyword):
    """Request a company detail page, retrying until the credit-code cell is present.

    Returns the ``(response, parsed_tree)`` pair of the first usable attempt.
    """
    headers = _browser_headers(keyword, {'cache-control': 'max-age=0'})
    while True:
        try:
            response = requests.get(url=detail_url, headers=headers, proxies=get_ip())
            tree = etree.HTML(response.text)
            credit_nodes = tree.xpath(_CREDIT_CODE_XPATH)
        except Exception as exc:  # was a bare except, which also swallowed Ctrl-C
            print('错误:', exc)
            _discard_current_proxy('详细内容获取,ip过期,更换!!!', 20)
            continue
        if credit_nodes:
            return response, tree
        # An empty credit-code cell is treated as a failed fetch and retried.
        print(credit_nodes)
        _discard_current_proxy('信息空,从新获取', 20)


def _first_text(tree, xpath, error_label, strip=False):
    """Return the first text node matched by *xpath*, or '' (after printing *error_label*) if none."""
    nodes = tree.xpath(xpath)
    if not nodes:
        print(error_label, nodes)
        return ''
    return nodes[0].strip() if strip else nodes[0]


def _extract_synopsis(response, tree):
    """Return the synopsis text, trying the primary XPath first and then the fallback."""
    nodes = tree.xpath(_SYNOPSIS_XPATH) or tree.xpath(_SYNOPSIS_FALLBACK_XPATH)
    if nodes:
        return nodes[0]
    print('内容:', response.text)
    print('synopsis错误:', nodes)
    return ''


def qcc(company):
    """
    Look up a company on qcc.com and return its basic registration details.

    The search result page is fetched first; the first result row that is
    either marked with the class ``'frtrt '`` or whose name equals *company*
    is followed to its detail page, from which the fields are scraped.

    Both network steps retry without limit: on every failure the first entry
    of ``./qcc_ip_list.txt`` is discarded and the request is repeated after a
    random delay, so this call blocks until a request succeeds.

    NOTE(review): ``get_ip()`` is passed as ``proxies=`` and presumably reads
    the head of that same list file - confirm against its definition.

    :param company: company name to search for
    :return: ``[master, synopsis, Credit_Code, business_scope]`` - person in
        charge, synopsis, social credit code and business scope. Any field
        that cannot be located is ``''``; ``['', '', '', '']`` is returned
        when the search yields no rows or no row matches. A successful result
        is also stored in the module-level ``all_company`` dict.
    """
    keyword = str(company)

    search_tree = etree.HTML(_fetch_search_page(keyword).text)  # search result list
    # etree.HTML() yields None for an empty document; treat that as "no rows".
    rows = search_tree.xpath(_ROWS_XPATH) if search_tree is not None else []
    if not rows:
        print('长度空,无对应信息')
        return ['', '', '', '']
    print('长度:', len(rows), '可以进行数据列表的获取!')

    for row in rows:
        print('进入企查查获取信息!!!')
        hrefs = row.xpath(_ANCHOR_HREF_XPATH)
        onclicks = row.xpath(_ANCHOR_ONCLICK_XPATH)
        if not hrefs or not onclicks:
            # Rows without the company anchor cannot be followed; the original
            # raised IndexError on them.
            continue
        name_match = _SEARCH_NAME_RE.search(onclicks[0])
        name = name_match.group(1) if name_match else ''
        print('企业名称:', name)

        row_class = row.xpath('./@class')
        # NOTE(review): 'frtrt ' (with trailing space) is presumably the
        # site's marker for the top match - confirm against live markup.
        is_marked_row = bool(row_class) and row_class[0] == 'frtrt '
        if not (is_marked_row or company == name):
            continue

        detail_url = _SITE_ROOT + hrefs[0]
        time.sleep(random.random() * 10)
        detail, details = _fetch_detail_page(detail_url, keyword)
        print(detail)

        synopsis = _extract_synopsis(detail, details)  # synopsis
        Credit_Code = _first_text(details, _CREDIT_CODE_XPATH, 'Credit_Code错误:', strip=True)  # social credit code
        business_scope = _first_text(details, _BUSINESS_SCOPE_XPATH, 'business_scope错误:', strip=True)  # business scope
        master = _first_text(details, _MASTER_XPATH, 'master错误:')  # person in charge

        end_list = [master, synopsis, Credit_Code, business_scope]
        print('基本信息:', end_list)
        all_company[company] = end_list
        return end_list

    print('企查查没有找到对应的公司!')
    return ['', '', '', '']

©️2020 CSDN 皮肤主题: 1024 设计师:上身试试 返回首页