# 仅供学习。(For study purposes only.)
# coding=utf-8
import os
import requests
from lxml import etree
import re
from urllib.parse import quote
class QiChaChaSpider(object):
    """Spider that searches a company keyword on qcc.com and scrapes its detail page."""

    def __init__(self, kwd):
        """Build the search URL and request headers for keyword *kwd*.

        :param kwd: company name / keyword to search for (str).
        """
        # NOTE: the second positional argument of quote() is `safe`, not an
        # encoding.  safe='' percent-encodes every reserved character (incl. '/'),
        # which matches the original request's observable behavior.
        self.query_url = "https://www.qcc.com/web/search?key=%s" % quote(kwd, safe='', encoding='utf-8')
        # Hard-coded session cookie; will expire — replace with a fresh one when requests start failing.
        self.cookie_values = '__guid=84250399.122939647196311810000.1629279260275.069; UM_distinctid=17b589c949c1dc-05f3fed884762c-45410629-ffc00-17b589c949d441; _uab_collina=162927926213078340246014; QCCSESSID=7qcifltrsd50avr0ntt17enj17; acw_tc=6525b7a116294212420678146e47a3502484a9fe6b5b5b1fd8a2465651; CNZZDATA1254842228=139459761-1629275828-https%253A%252F%252Fwww.so.com%252F%7C1629417815; acw_sc__v2=611efeda37a4768e67d96b81211fd577252fc103; monitor_count=34; zg_did=%7B%22did%22%3A%20%2217b589c94ff238-0aa38fa349d694-45410629-ffc00-17b589c95002a1%22%7D; zg_294c2ba1ecc244809c552f8f6fd2a440=%7B%22sid%22%3A%201629421237682%2C%22updated%22%3A%201629421280863%2C%22info%22%3A%201629279261978%2C%22superProperty%22%3A%20%22%7B%5C%22%E5%BA%94%E7%94%A8%E5%90%8D%E7%A7%B0%5C%22%3A%20%5C%22%E4%BC%81%E6%9F%A5%E6%9F%A5%E7%BD%91%E7%AB%99%5C%22%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%5C%22%24utm_source%5C%22%3A%20%5C%22360%5C%22%2C%5C%22%24utm_medium%5C%22%3A%20%5C%22cpc%5C%22%2C%5C%22%24utm_term%5C%22%3A%20%5C%22%E6%9F%A5%E4%BC%81%E4%B8%9A%5C%22%7D%22%2C%22referrerDomain%22%3A%20%22www.so.com%22%2C%22cuid%22%3A%20%220aa95e2b58e2a5b75bb170765a2799a8%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%2C%22firstScreen%22%3A%201629421237682%7D'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "referer": self.query_url,
            # requests accepts plain str header values; no manual .encode() needed.
            'Cookie': self.cookie_values,
        }

    def run(self):
        """Spider entry point.

        Resolves the detail URL, scrapes it, logs the exchange and returns a
        dict ``{"content": <data dict or error msg>, "status": 1|0}``.
        """
        detail_url = self.get_detail_url()
        if not detail_url:
            value_dict = {
                "content": "Get detail url error",
                "status": 0
            }
        else:
            try:
                value_dict = {
                    "content": self.get_data(detail_url),
                    "status": 1
                }
            except Exception:
                # Boundary catch: any parsing/network failure is reported as a
                # soft error so the caller still gets a status dict.
                value_dict = {
                    "content": "Get data error",
                    "status": 0
                }
        self.log(self.query_url, detail_url, value_dict)
        return value_dict

    @staticmethod
    def log(query_url, detail_url, value_dict):
        """Append the request & response summary to ``./logs/<today>.txt``."""
        import datetime
        os.makedirs('./logs', exist_ok=True)
        date = str(datetime.date.today())
        with open(f"./logs/{date}.txt", 'a', encoding='utf-8') as f:
            f.write('<Spider 2>' + '\n')
            f.write(f'<Request : {datetime.datetime.now()} query_url : {query_url}>\n')
            if detail_url:
                f.write(' ' * 11 + 'detail_url : ' + detail_url + '\n')
            f.write('<Response : ' + str(value_dict) + '\n\n')

    def get_detail_url(self):
        """Return the first search-result detail URL, or None if none found."""
        response = requests.get(self.query_url, headers=self.headers)
        element_obj = etree.HTML(response.text)
        hrefs = element_obj.xpath('//*[@class="maininfo"]/a/@href')
        return hrefs[0] if hrefs else None

    def get_data(self, detail_url):
        """Scrape the company detail page at *detail_url*.

        :returns: dict mapping the fixed field keys to the scraped values.
        """
        total_data_list = []
        response = requests.get(detail_url, headers=self.headers)
        element_obj = etree.HTML(response.text)
        # Top contact-info panel (legal person / phone / address)
        xpath_root1 = '//div[@class="contact-info"]'
        try:
            boss = element_obj.xpath(xpath_root1 + '/div[1]/span[1]/span/span/span/a/text()')[0]
        except IndexError:
            boss = ''
        try:
            phone = element_obj.xpath(xpath_root1 + '/div[2]/span[1]/span/span[2]/text()')[0]
            total_data_list.append(phone)
        except IndexError:
            total_data_list.append('')
        try:
            address = element_obj.xpath('//*[@class="texta"]/*[1]/text()')[0]
            total_data_list.append(address)
        except IndexError:
            total_data_list.append('')
        # Main business-info table
        xpath_root2 = '//*[@id="cominfo"]/div[2]/table/tr'
        element_list = element_obj.xpath(f'{xpath_root2}/td| {xpath_root2}/td/*[1]')
        data_list = []
        for node in element_list:
            texts = node.xpath('./text()')
            # Skip empty cells and single-space placeholder cells.
            if texts and texts[0] != ' ':
                data_list.append(texts[0])
        data_list.insert(5, boss)
        # Cells alternate label/value: keep the values (odd positions).
        # Use enumerate, not list.index(): index() returns the FIRST occurrence,
        # so duplicate values got the wrong parity (and it was O(n^2)).
        for pos, value in enumerate(data_list):
            if pos % 2 != 0:
                total_data_list.append(value)
        # Strip all whitespace from every collected value.
        total_data_list = [re.sub(r'\s', '', d) for d in total_data_list]
        key_list = [
            "PHONE",
            "address",
            "tongyi_id",
            "ENTNAME",
            "LEGALPERSON",  # legal representative
            "jingying_status",
            "OPFROM",  # date of establishment
            "REGCAP",  # registered capital
            "shijiao_money",
            "APPRDATE",  # approval date
            "zuzhijigou_id",
            "REGNO",  # business registration number
            "UNCID",  # taxpayer ID
            "ENTTYPE",  # company type
            "yingye_date",
            "UNC",  # taxpayer qualification
            "hangye",
            "diqu",
            "REGORG",  # registration authority
            "PERSON",  # staff size
            "NOWMAN",  # number of insured employees
            "old_name",
            "english_name",
            "jinchukou_id",
            "OPLOC",  # registered address
            "OPSCOPE",  # business scope
        ]
        return dict(zip(key_list, total_data_list))