Using requests to automatically fetch some free company data from a website
Work required looking up information on a batch of companies (roughly 2,000), which would have been a lot of manual effort, so I wrote a small crawler to handle it. The code is below, kept as a memo and for learning.
Local environment: Win10, Python 3.8, Redis 7.0+, Oracle 10+
The proxy IP pool is built on the https://github.com/jhao104/proxy_pool project.
Starting the Redis-backed proxy pool
1. Start the Redis database
$ sudo service redis-server start
2. Configure the Redis connection and the webApi port
In the proxy_pool project's setting.py, point the Redis address:port at the instance started above:
DB_CONN = 'redis://@127.0.0.1:6379'
—— Redis address:port
PORT = 6378
—— port of the webApi service, which the main program calls to fetch proxy IPs
3. Start the proxy webApi service inside the proxy_pool project
..\proxy_pool> python proxyPool.py server
4. Fetch a proxy IP by calling the webApi (a small smoke-test sketch follows these steps):
proxy = requests.get("http://127.0.0.1:6378/get/").json().get("proxy")
5. Keep the proxy IP pool refreshed
python proxyPool.py schedule
—— runs the scheduler that refreshes the proxy IP pool; see the proxy_pool project for usage details
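Before wiring the pool into the crawler below, it is worth a quick smoke test of the webApi. A minimal sketch, assuming the service runs on port 6378 as configured above; the test target http://httpbin.org/ip and the fetch/try/delete flow are my own additions, not part of the original code:
# -*- coding: utf-8 -*-
# Smoke test for the proxy_pool webApi (assumes PORT = 6378 as configured above).
import requests

proxy = requests.get("http://127.0.0.1:6378/get/").json().get("proxy")
print("got proxy:", proxy)
try:
    # http://httpbin.org/ip is an arbitrary test target, not part of the original code
    resp = requests.get("http://httpbin.org/ip",
                        proxies={"http": "http://{}".format(proxy)},
                        timeout=10)
    print("proxy works, exit IP:", resp.json())
except Exception as exc:
    # a proxy that fails the test can be removed from the pool via the delete endpoint
    print("proxy failed, removing it:", exc)
    requests.get("http://127.0.0.1:6378/delete/?proxy={}".format(proxy))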
Fetching the data
# -*- coding: utf-8 -*-
import requests
import re
import json
from lxml import etree
from urllib import parse
import os
from pandas import DataFrame
def get_proxy():
    # fetch a fresh proxy from the pool's webApi and reset its failure counter
    global proxy
    global proxy_retry_times
    proxy_retry_times = 0
    if if_proxy:
        proxy = requests.get("http://127.0.0.1:6378/get/").json().get("proxy")
def delete_proxy(proxy_failed):
    # drop a failed proxy from the pool
    if if_proxy:
        requests.get("http://127.0.0.1:6378/delete/?proxy={}".format(proxy_failed))
def cus_requests(url, testify: str):
    html_code = None
    global proxy_retry_times
    for retry in range(15):  # retry up to 15 times on failure
        print("<<< attempt - {}".format(parse.unquote(url)))
        try:
            if if_proxy:
                response = requests.get(url=url, headers=headers, proxies={"http": "http://{}".format(proxy)}, timeout=10)  # request through the proxy
            else:
                response = requests.get(url=url, headers=headers, timeout=10)  # direct request
            if response:  # got a response object
                if response.status_code == 200:  # HTTP 200
                    if testify in response.text:  # expected marker text is present
                        html_code = response.text
                        break  # success, stop retrying
        except Exception as e:
            # append the error to a log file
            log_file = open('./log.txt', 'a')
            log_file.write(str(e) + '\r\n' * 2)
            log_file.close()
        # count failures for this proxy; swap it out after more than 5
        proxy_retry_times += 1
        if proxy_retry_times > 5:
            delete_proxy(proxy)
            get_proxy()
    if not html_code:
        return  # exceeded the maximum number of retries, give up on this URL
    return html_code
def search_keyword(keyword):
    url = 'http://www.tianyancha.com/search?key={}'.format(parse.quote(keyword))  # search URL
    html_code = cus_requests(url, '_相关搜索结果')
    if not html_code:
        return []
    search_company_items = etree.HTML(html_code).xpath(
        "//a[contains(@class, 'index_alink')]")  # company result nodes
    if not search_company_items:
        return []
    result = list()
    for element in search_company_items:
        index = dict()
        index['企业名称'] = ''.join(element.xpath(".//span/em/text()")) + ''.join(element.xpath(".//span/text()"))
        url = ''.join(element.xpath(".//@href"))
        pid = re.findall(r'company/(\d+)', url)  # company id in the detail-page URL
        if not pid:
            continue
        index['url_id'] = ''.join(pid)
        result.append(index)
    return result
def detail_info(res_base_item):
    url = 'http://www.tianyancha.com/company/{}'.format(parse.quote(res_base_item.get("url_id")))  # detail page URL
    html_code = cus_requests(url, '天眼风险')
    if not html_code:  # detail page could not be fetched
        return None
    info_parser = etree.HTML(html_code)
    aaum_data = dict()
    try:
        # location of the embedded JSON payload, verified on 2023-01-31
        xpath_location = \
            json.loads(info_parser.xpath("//script/text()")[0])['props']['pageProps']['dehydratedState']['queries'][0][
                'state']['data']['data']
    except Exception:
        xpath_location = dict()
    aaum_data['credit_code'] = xpath_location.get('creditCode')
    aaum_data['company_type'] = xpath_location.get('companyOrgType')
    # aaum_data['addr'] = ''.join(info_parser.xpath(
    #     "//table[contains(@class,'index_tableBox')]//td[text()='注册地址']/following-sibling::td[1]//span/text()"))
    aaum_data['addr'] = xpath_location.get('taxAddress')
    aaum_data['open_status'] = xpath_location.get('regStatus')
    aaum_data['company_name'] = res_base_item.get("企业名称")
    return aaum_data
def go_spider(task_list: list, use_proxy=False):
    '''
    Return the search results for task_list as a DataFrame.
    The 'use_proxy' parameter requires a Redis-backed proxy_pool, see --> https://github.com/jhao104/proxy_pool
    '''
    global headers
    global if_proxy
    if_proxy = use_proxy
    get_proxy()
    # request headers
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',  # note: decoding 'br' responses requires the brotli package
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.tianyancha.com',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
    }
    if not task_list:  # nothing to search, exit
        return
    columns = ['企业名称', '营业状态', '统一社会信用代码', '企业类型', '地址', '关键词']
    rowdata = []
    for index, element_keyword in enumerate(task_list):
        # for index, element_keyword in enumerate(['泉州城建集团有限公司', '中国航天科工防御技术研究院']):
        print("<<< searching = 【{}】".format(element_keyword))
        # search the keywords one by one
        search_items = search_keyword(keyword=element_keyword)
        res_base_item = None
        for element in search_items:
            # check whether a company name in the result list matches the keyword exactly
            if element_keyword.replace(' ', '').lower() == element.get("企业名称").replace(' ', '').lower():
                res_base_item = element
                break
        if not res_base_item:  # no company name in the search results matches the keyword
            print("<<< no match = 【{}】".format(element_keyword))
            rowdata.append([element_keyword, '', '', '', '', element_keyword])
            continue
        # fetch the detail page
        aaum_data = detail_info(res_base_item=res_base_item)
        if not aaum_data:
            print("<<< no detail page = 【{}】".format(element_keyword))
            continue
        print(aaum_data)
        rowdata.append([aaum_data.get("company_name"), aaum_data.get("open_status"), aaum_data.get("credit_code"),
                        aaum_data.get("company_type"), aaum_data.get("addr"), element_keyword])
    # collect the rows into a DataFrame
    workdata = DataFrame(rowdata, columns=columns)
    return workdata
if __name__ == '__main__':
    # make sure task.txt exists; create an empty one if it is missing
    if not os.path.exists("./task.txt"):
        open("./task.txt", 'w', encoding='utf-8').close()
    # read task.txt to get the list of company names to crawl
    with open('./task.txt', 'r', encoding='utf-8') as corp_file:
        corp_list = [i.strip() for i in corp_file.readlines()]
    go_spider(corp_list)
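The __main__ block above discards go_spider's return value. If a local file is enough and the Oracle step below is not needed, the returned DataFrame can be written out directly. A minimal sketch reusing corp_list from the script above; the filename result.csv is my own choice:
# A possible alternative to the bare go_spider(corp_list) call above: keep the returned
# DataFrame and write it to a CSV file (the filename 'result.csv' is my own choice).
result = go_spider(corp_list, use_proxy=False)
if result is not None:
    # utf-8-sig keeps the Chinese column headers readable when opened in Excel
    result.to_csv('./result.csv', index=False, encoding='utf-8-sig')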
Inserting into the database
from crawler import requests_tianyancha
import numpy as np
import cx_Oracle
def db_insert(data, host, port, service_name, account, password):
    dsn = cx_Oracle.makedsn(host, port, service_name)  # note: the third positional argument of makedsn is the SID; pass service_name=... for a service name
    # account, password, server
    conn = cx_Oracle.connect(account, password, dsn)
    writed_data = np.array(data)  # DataFrame -> array
    writed_data = writed_data.tolist()  # array -> list of rows
    sql = 'insert into CORP_DATA(COMPANY_NAME,OPEN_STATUS,CREDIT_CODE,COMPANY_TYPE,ADDR,SEARCH_KEYWORD)' \
          ' values(:1,:2,:3,:4,:5,:6)'
    print('connecting to {}'.format(host))
    cur = conn.cursor()
    print('{} connected'.format(host))
    try:
        cur.executemany(sql, writed_data)  # execute the insert in one batch
    except Exception as e:
        print(e)
    finally:
        # commit and close promptly to avoid holding locks that block other users
        cur.close()
        conn.commit()
        conn.close()
def cutted_list(task_list, max_rows=10):
    # yield task_list in chunks of max_rows items
    length = len(task_list)
    left, right = 0, max_rows
    while right < length:
        yield task_list[left:right]
        left += max_rows
        right += max_rows
    yield task_list[left:]
if __name__ == '__main__':
    with open('./task.txt', 'r', encoding='utf-8') as corp_file:
        corp_list = [i.strip() for i in corp_file.readlines()]
    for temp_list in cutted_list(corp_list, max_rows=100):
        tianyancha_data = requests_tianyancha.go_spider(temp_list, use_proxy=True)
        if tianyancha_data is None or tianyancha_data.empty:  # nothing was crawled for this chunk
            continue
        db_insert(tianyancha_data, '127.0.0.1', '9999', 'ORCL', 'admin', 'pwd')
        print('chunk written to the database')
        # record the finished keywords so an interrupted run can be resumed (see the sketch below)
        done_task = open('./done_task.txt', 'a', encoding='utf-8')
        done_task.writelines([i + '\n' for i in temp_list])
        done_task.close()
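done_task.txt records the keywords that have already been processed, so an interrupted run does not need to start from scratch. A minimal sketch of how the finished keywords could be filtered out on restart; the helper load_done_tasks is my own name and not part of the original code:
import os

def load_done_tasks(path='./done_task.txt'):
    # return the set of keywords already written to done_task.txt (empty set on first run)
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as done_file:
        return {line.strip() for line in done_file if line.strip()}

# drop the finished keywords before chunking, e.g. right after corp_list is built:
# corp_list = [kw for kw in corp_list if kw not in load_done_tasks()]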