# -*- coding: utf-8 -*-
import requests
from lxml import etree
import json
import time
import random
from copy import deepcopy
class Tianyan():
    """Scrape Tianyancha search results for the query 沧县塑料 (pages 1-5).

    Workflow: build the five listing-page URLs, extract each company's
    detail-page URL and registration date from the listing, scrape the
    detail page for name / address / business scope / legal person /
    registration number, and append every record as one JSON line to
    '信息.txt'.

    NOTE(review): the hard-coded Cookie embeds a login token that expires;
    once stale the site will serve login / anti-bot pages instead of data.
    """

    def __init__(self):
        # Listing-page URL template; {} is the 1-based page number and the
        # "key" query parameter is the URL-encoded search term "沧县塑料".
        self.url = 'https://www.tianyancha.com/search/ola3/p{}?key=%E6%B2%A7%E5%8E%BF%E5%A1%91%E6%96%99'
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
                        'Cookie':'TYCID=ac1dfec0298b11e896d65952088acd0e; undefined=ac1dfec0298b11e896d65952088acd0e; ssuid=5431237103; RTYCID=a9b338e6798d4eb39fef9257fd6b9b9d; aliyungf_tc=AQAAAMBzHiKiTwgAqo/Y3f5KVHsxjcZG; csrfToken=oqv83ZlWDQkY1v32arJAja4V; jsid=SEM-BAIDU-PP-SY-000214; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1522481067,1522487432,1522586369,1522586370; bannerFlag=true; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUzMDQ0OTM4OSIsImlhdCI6MTUyMjU4NjcxMywiZXhwIjoxNTM4MTM4NzEzfQ.lvI-NEDnqyN7eN_V4FFvMnsmf_2S8LvEr79r3xVutqXuIJ1F4VAkQk9DXasWiE9eC2dKGUsBG7ZyHSJpuuq-iw%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215530449389%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUzMDQ0OTM4OSIsImlhdCI6MTUyMjU4NjcxMywiZXhwIjoxNTM4MTM4NzEzfQ.lvI-NEDnqyN7eN_V4FFvMnsmf_2S8LvEr79r3xVutqXuIJ1F4VAkQk9DXasWiE9eC2dKGUsBG7ZyHSJpuuq-iw; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1522586767'}

    def get_url_list(self):
        """Return the listing-page URLs for pages 1 through 5."""
        return [self.url.format(page) for page in range(1, 6)]

    def parse_url(self, url):
        """Fetch one listing page and extract per-company stubs.

        Returns a list of dicts with keys '注册时间' (registration date,
        None when the cell is missing) and 'url_content' (the company
        detail-page URL).
        """
        # timeout so a stalled connection cannot hang the crawl forever
        resp = requests.get(url, headers=self.headers, timeout=30)
        page = etree.HTML(resp.content.decode())
        divs = page.xpath("//div[@class='b-c-white search_result_container']/div")
        url_content_list = []
        for div in divs:
            # `data-id` is absent on non-company rows (ads, pagination);
            # skip those instead of crashing with IndexError.
            data_ids = div.xpath('./@data-id')
            if not data_ids:
                continue
            company_id = data_ids[0]  # was `id`, which shadows the builtin
            dates = div.xpath(".//div[@class='title overflow-width'][3]/span/text()")
            url_content_list.append({
                '注册时间': dates[0] if dates else None,
                'url_content': 'https://www.tianyancha.com/company/' + company_id,
            })
        return url_content_list

    def parse_content(self, url_content_list):
        """Fetch each company's detail page and add its fields in place.

        Each item gains '公司名字', '地址', '经营范围', '法人' and '注册号',
        stored as the raw xpath result lists (matching the original output
        format). Returns the enriched list.
        """
        for item in url_content_list:
            resp = requests.get(item['url_content'], headers=self.headers, timeout=30)
            page = etree.HTML(resp.content.decode())
            item['公司名字'] = page.xpath("//span[@class='f18 in-block vertival-middle sec-c2']/text()")
            item['地址'] = page.xpath("//span[@class='in-block overflow-width vertical-top']/text()")
            item['经营范围'] = page.xpath("//span[@class='js-full-container ']//text()")
            item['法人'] = page.xpath("//div[@class='f18 overflow-width sec-c3']//text()")
            item['注册号'] = page.xpath("//table[@class='table companyInfo-table f14']/tbody/tr[1]/td[2]/text()")
            print(item)
        return url_content_list

    def save_content(self, content_list):
        """Append each record as one JSON line to '信息.txt' (UTF-8)."""
        # Explicit encoding: the platform default can raise
        # UnicodeEncodeError on the Chinese keys (e.g. under Windows).
        with open('信息.txt', 'a+', encoding='utf-8') as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self):
        """Crawl all listing pages, scrape details, and persist results."""
        for url in self.get_url_list():
            print(url)
            stubs = self.parse_url(url)
            records = self.parse_content(stubs)
            self.save_content(records)
if __name__ == '__main__':
    # Entry point: crawl all five result pages and persist the records.
    spider = Tianyan()
    spider.run()