# 仅供学习。(For educational use only.)
# coding=utf-8
import datetime
import os
import re

import requests
from lxml import etree
class TYChaSpider(object):
    """Spider that searches tianyancha.com for a keyword (company name),
    scrapes the first matching company's detail page, and logs the outcome.

    NOTE(review): relies on a session cookie copied from a logged-in browser;
    it expires, so refresh `cookie_values` when requests start failing.
    """

    def __init__(self, kw):
        # kw: search keyword, usually a company name.
        self.kw = kw
        # Hard-coded session cookie captured from a real browser session.
        self.cookie_values = 'TYCID=28646c20d32611eb8874e3b09d7892f1; ssuid=1017598887; _ga=GA1.2.1910279179.1624344673; tyc-user-phone=%255B%252213753949664%2522%252C%2522150%25200122%25203280%2522%255D; __insp_slim=1624685877519; __insp_wid=677961980; __insp_nv=true; __insp_targlpt=5LyB5Lia6K6k6K_BIC0g5aSp55y85p_l; __insp_targlpu=aHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vY2xhaW0vZW50cnkvNTAwNzc4MTcwNz9mcm9tPWYz; __insp_norec_sess=true; creditGuide=1; jsid=https%3A%2F%2Fwww.tianyancha.com%2F%3Fjsid%3DSEM-BAIDU-PZ-SY-2021112-JRGW; _gid=GA1.2.1539224761.1629093801; RTYCID=32d01989fd8a4ca698197f5c702ee5f1; CT_TYCID=d325e96b83574e45b48889072d9d5503; bdHomeCount=4; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2213753949664%22%2C%22first_id%22%3A%2217a327c40e88fc-09869a0570e7d5-434b092e-1049088-17a327c40e9729%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217a327c40e88fc-09869a0570e7d5-434b092e-1049088-17a327c40e9729%22%7D; searchSessionId=1629167251.56857563; bannerHide=notlogin; aliyungf_tc=936e6e0388affe0b1a870a913e9e5b6343750d60c7fd7b0c6f638b9b3a0e602e; acw_tc=76b20f4e16291869673895773ea54cd81b8debae27f7d9992b1ce8639df95e; csrfToken=dBfQPe67Y2LvDqP0ilUmgm_b; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1629093801,1629107091,1629166693,1629186968; tyc-user-info={%22state%22:%220%22%2C%22vipManager%22:%220%22%2C%22mobile%22:%2213753949664%22}; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxMzc1Mzk0OTY2NCIsImlhdCI6MTYyOTE4NzAxNCwiZXhwIjoxNjYwNzIzMDE0fQ.Ru_W33VI5wCIaLz75H2Llp70JLYRD6yiQy4Y5fs-F6jSwZXffRarK8NRjTxLbEFbXCyLWeXSyvLpivEJy92vcQ; tyc-user-info-save-time=1629187012389; bannerFlag=true; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1629187407; _gat_gtag_UA_123487620_1=1; cloud_token=d8a3687095b448ff80f2000e1ad88016; cloud_utm=f7b2b6e1148442fa97eea234acf70830'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
            "referer": "https://www.tianyancha.com/",
            'Host': 'www.tianyancha.com',
            # requests expects header values as str; no need to encode to bytes.
            'Cookie': self.cookie_values,
        }

    def run(self):
        """Spider main entry point.

        Returns a dict ``{"content": <data or message>, "status": 1|0}``
        where status 1 means the detail page was scraped successfully.
        """
        url_list = self.get_urls()
        if url_list:
            try:
                content_dict = self.get_data(url_list[0])
                value_dict = {
                    "content": content_dict,
                    "status": 1
                }
            except Exception:
                # Scraping is best-effort: page layout changes or network
                # failures are reported in the payload, not raised.
                value_dict = {
                    "content": "Get data error",
                    "status": 0
                }
        else:
            value_dict = {
                "content": "Get url error",
                "status": 0
            }
        self.log(self.kw, value_dict, url_list)
        return value_dict

    @staticmethod
    def log(kw, value_dict, url_list):
        """Append request keyword, resolved URL and response payload to a
        per-day log file under ./logs/."""
        # First run on a fresh checkout: the log directory may not exist yet.
        os.makedirs("./logs", exist_ok=True)
        date = str(datetime.date.today())
        with open(f"./logs/{date}.txt", 'a', encoding='utf-8') as f:
            f.write('<Spider 1>' + '\n')
            f.write(f'<Request : {datetime.datetime.now()} kwd : {kw}>\n')
            if url_list:
                f.write(' ' * 11 + url_list[0] + '\n')
            f.write('<Response : ' + str(value_dict) + '\n\n')

    def get_urls(self):
        """Search the keyword and return a one-element list holding the first
        result's detail-page URL, or an empty list on any failure."""
        url_list = []
        try:
            url = "https://www.tianyancha.com/search?key=%s" % self.kw
            response = requests.get(url, headers=self.headers)
            element_obj = etree.HTML(response.text)
            try:
                url_list.append(element_obj.xpath('//*[@id="search_company_0"]/div/div[3]/div/a/@href')[0])
            except IndexError:
                # Layout variant: the link sits in the 4th column, not the 3rd.
                url_list.append(element_obj.xpath('//*[@id="search_company_0"]/div/div[4]/div/a/@href')[0])
        except Exception:
            # Deliberately best-effort: callers treat [] as "URL not found".
            pass
        return url_list

    def get_data(self, url):
        """Scrape one company detail page.

        Returns a dict of company fields: fixed English keys for the current
        page layout, or raw scraped key/value text pairs for the old layout.
        """
        response = requests.get(url, headers=self.headers, verify=True)
        element_obj = etree.HTML(response.text)
        data_list = []
        # ---- page-top summary block ----
        topinfo_path = '//*[@id="company_web_top"]/div[3]/div[3]'
        # Company name.
        data_list.append(element_obj.xpath(topinfo_path + '/div[1]/h1/text()')[0])
        # Phone number (two layout variants; '' when absent).
        try:
            try:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[1]/div[1]/span[4]/text()')[0])
            except IndexError:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[2]/div[1]/span[4]/text()')[0])
        except IndexError:
            data_list.append('')
        # Website (text node, falling back to the anchor href; '' when absent).
        try:
            try:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[2]/div[1]/span[2]/text()')[0])
            except IndexError:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[2]/div[1]/a[2]/@href')[0])
        except IndexError:
            data_list.append('')
        # Address (two layout variants; '' when absent).
        try:
            try:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[2]/div[2]/div/div/text()')[0])
            except IndexError:
                data_list.append(element_obj.xpath(topinfo_path + '/div[3]/div[2]/div[2]/span[3]/text()')[0])
        except IndexError:
            data_list.append('')
        # ---- detailed base-info table ----
        is_oldpage = False
        baseinfo_path = '//*[@id="_container_baseInfo"]/table/tbody'
        # Legal representative; a missing node signals the old page layout.
        try:
            data_list.append(element_obj.xpath(baseinfo_path + '/tr[1]/td[2]/div/div[1]/div[2]/div[1]/a/text()')[0])
        except IndexError:
            is_oldpage = True
        if not is_oldpage:
            # --- current page layout ---
            # Companies held by the legal representative ("暂无" = none listed).
            have_company = element_obj.xpath(baseinfo_path + '/tr[1]/td[2]/div/div[2]/div/div[2]/span[1]/text()')
            data_list.append('、'.join(have_company) if have_company else "暂无")
            # Table column 1 (rows 2-11).
            for i in range(2, 12):
                if i == 3:
                    # Registered capital lives in a nested div on this row.
                    data_list.append(element_obj.xpath(baseinfo_path + '/tr[%s]/td[2]/div/text()' % i)[0])
                    continue
                try:
                    data_list.append(element_obj.xpath(baseinfo_path + '/tr[%s]/td[2]/text()' % i)[0])
                except IndexError:
                    data_list.append(element_obj.xpath(baseinfo_path + '/tr[%s]/td[2]/span/text()' % i)[0])
            # Table column 2 (rows 1-11); rows without a value are skipped.
            for i in range(1, 12):
                try:
                    data_list.append(element_obj.xpath(baseinfo_path + '/tr[%s]/td[4]/text()' % i)[0])
                except IndexError:
                    continue
            # Table column 3 (rows 5-7).
            for i in range(5, 8):
                data_list.append(element_obj.xpath(baseinfo_path + '/tr[%s]/td[6]/text()' % i)[0])
            # Output keys, matched positionally against data_list; any values
            # scraped beyond the 26 keys are dropped by zip().
            name_list = [
                "ENTNAME",
                "PHONE",
                "diqu",
                "address",
                "LEGALPERSON",
                "ALLENT",
                "OPFROM",
                "REGCAP",
                "shijiao_money",
                "tongyi_id",
                "yingye_date",
                "ENTTYPE",
                "NOWMAN",
                "old_name",
                "OPLOC",
                "OPSCOPE",
                "jingying_status",
                "REGNO",
                "UNCID",
                "UNC",
                "hangye",
                "REGORG",
                "english_name",
                "zuzhijigou_id",
                "APPRDATE",
                "PERSON",
            ]
            # Strip all whitespace from the business-term field
            # (original note said 营业期限 — index presumed by the author).
            data_list[11] = "".join(re.findall(r'\S', data_list[11]))
            data_dict = dict(zip(name_list, data_list))
        else:
            # --- old page layout: parallel key/value text lists ---
            keys_list = element_obj.xpath(baseinfo_path + '/tr/td/text()')
            values_list = element_obj.xpath(baseinfo_path + '/tr/td/span/text()')
            data_dict = dict(zip(keys_list, values_list))
        return data_dict