This article describes how to use a Python program to automatically scrape basic company information from sites such as Tianyancha and Qichacha. The code in this article is for study and reference only; please obtain such data assets through legitimate channels.
The project directory layout, including the contents of the w3 directory, is as follows.
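Reconstructed from the import statements in the listings below, the layout is roughly (treat this as an approximation):

    <project root>/
        scheduler.py
        util.py
        w3/
            __init__.py
            common.py
            save.py
            aiqicha/
                __init__.py
                parse.py
            qichacha/
                __init__.py
                parse.py
            tianyancha/
                __init__.py
                parse.py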
Selected source files from the project are shown below.
Each parse.py scrapes the basic company information from one specific site. Because every data provider uses a different page DOM structure, each provider gets its own parse implementation; and whenever a page's DOM structure changes, the corresponding parse code has to be adjusted.
parse.py (the scraping implementation for Aiqicha):
import time
from collections import namedtuple
from xml.etree.ElementTree import fromstring

import w3.common


class Data(object):
    url_root = "https://aiqicha.baidu.com/"
    url_path_prefix_of_key_query = "/company_detail_"
    url_get = "https://aiqicha.baidu.com/company_detail_{}"


def parse_header(outer_html, key):
    # Pull tel/website/address/introduce out of the header block by positional
    # child indexing; these indices break whenever the page DOM changes.
    tel = "None"
    website = "None"
    address = "None"
    introduce = "None"
    elem = fromstring(outer_html)
    try:
        tel = elem[0][0][0].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    try:
        website = elem[1][0][0].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    address = elem[1][1][0].text.strip()
    try:
        introduce = elem[2][0][0].tail.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    return tel, website, address, introduce


def parse_content(outer_html, key):
    # Pull the registration details out of the basic-info table, again by
    # positional indexing. Fields that are often missing are wrapped in try/except.
    legal_person = "None"
    running_status = "None"
    register_money = "None"
    firm_group = "None"
    social_id = "None"
    tax_id = "None"
    commercial_id = "None"
    register_authorizer = "None"
    register_date = "None"
    firm_type = "None"
    firm_business = "None"
    elem = fromstring(outer_html)
    legal_person = elem[0][0][1][1][0].text.strip()
    try:
        running_status = elem[0][0][3].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    try:
        register_money = elem[0][1][1].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    try:
        firm_group = elem[0][2][3].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    social_id = elem[0][3][1].text.strip()
    try:
        tax_id = elem[0][3][3].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    try:
        commercial_id = elem[0][4][1].text.strip()
        register_authorizer = elem[0][5][1].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    register_date = elem[0][5][3].text.strip()
    try:
        firm_type = elem[0][6][1].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    try:
        firm_business = elem[0][9][1][0].text.strip()
    except Exception:
        w3.common.print_key_when_error(key)
        w3.common.print_stack()
    return (legal_person, running_status, register_money, firm_group, social_id, tax_id,
            commercial_id, register_authorizer, register_date, firm_type, firm_business)


def parse_helper(browser, key):
    # Search the site for the key (a unified social credit id), open the first
    # matching detail page, and scrape the header and basic-info sections.
    browser.visit(Data.url_root)
    time.sleep(w3.common.Data.sleep_seconds_after_launch)
    browser.execute_script('document.getElementById("aqc-search-input").focus()')
    search_key = browser.find_by_css("#aqc-search-input")
    search_key.fill(key)
    time.sleep(w3.common.Data.sleep_seconds_before_search)
    submit_btn = browser.find_by_css(".search-btn")
    submit_btn.click()
    time.sleep(w3.common.Data.sleep_seconds_before_detail)
    link_element = browser.links.find_by_partial_href(Data.url_path_prefix_of_key_query)
    pid_outer_html = link_element.first.outer_html  # Raises when the social id does not exist; use the first match.
    href = w3.common.get_href_from_outer_html(pid_outer_html, Data.url_path_prefix_of_key_query)
    items = href.split("_")  # e.g. "/company_detail_12345" -> ["/company", "detail", "12345"]
    pid = items[2]
    url = Data.url_get.format(pid)
    browser.visit(url)
    time.sleep(w3.common.Data.sleep_seconds_showing_detail)
    title_info = browser.find_by_css(".detail-header .header-top .header-content .content-title .name")
    firm_name = title_info.first.text
    head_info = browser.find_by_css(".detail-header .header-top .header-content .content-info")
    head_info_outer_html = head_info.first.outer_html
    header_items = parse_header(head_info_outer_html, key)
    content_info = browser.find_by_css(".main .basic-tab #basic-business .zx-detail-basic-table")
    content_info_outer_html = content_info.first.outer_html
    content_items = parse_content(content_info_outer_html, key)
    # Note: this assigns the values as attributes on the namedtuple class itself
    # rather than constructing an instance; attribute access still works downstream.
    firm_info_items = namedtuple("firm_info", w3.common.Data.firm_info_keys)
    firm_info_items.tel = header_items[0]
    firm_info_items.website = header_items[1]
    firm_info_items.address = header_items[2]
    firm_info_items.introduce = header_items[3]
    firm_info_items.legal_person = content_items[0]
    firm_info_items.running_status = content_items[1]
    firm_info_items.register_money = content_items[2]
    firm_info_items.firm_group = content_items[3]
    firm_info_items.social_id = content_items[4]
    firm_info_items.tax_id = content_items[5]
    firm_info_items.commercial_id = content_items[6]
    firm_info_items.register_authorizer = content_items[7]
    firm_info_items.establish_date = content_items[8]
    firm_info_items.firm_type = content_items[9]
    firm_info_items.firm_business = content_items[10]
    firm_info_items.firm_population = "None"
    firm_info_items.firm_name = firm_name
    return firm_info_items
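One detail worth flagging in parse_helper: namedtuple("firm_info", ...) creates a class, and the code then assigns field values as attributes on that class instead of constructing an instance. Attribute access still works downstream, but it defeats the point of a namedtuple. A minimal sketch of the more conventional usage, with a shortened, made-up field list for illustration (the real one lives in w3.common.Data.firm_info_keys):

    from collections import namedtuple

    # Hypothetical field list for illustration only.
    FirmInfo = namedtuple("FirmInfo", ["firm_name", "tel", "website"])

    record = FirmInfo(firm_name="Example Co.", tel="010-12345678", website="example.com")
    print(record.tel)        # attribute access, same as in parse_helper
    print(record._asdict())  # instances also convert to dicts, convenient for saving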
scheduler.py implements the scheduling logic for the scraping runs: it can rotate across the sites round-robin, or weight the rotation toward particular sites (see the sketch after the listing). Its contents:
import sys
import time
from collections import OrderedDict

from splinter import Browser

import util
import w3.common
from w3.common import print_firm_info, print_stack, close_window
from w3.save import save_firm_data, save_failed_parse_keys, clean_files


class Config(object):
    import w3.aiqicha.parse
    import w3.qichacha.parse
    import w3.tianyancha.parse
    isp_parser_map = OrderedDict()
    isp_parser_map[w3.common.Data.aqc] = w3.aiqicha.parse.parse_helper
    isp_parser_map[w3.common.Data.qcc] = w3.qichacha.parse.parse_helper
    isp_parser_map[w3.common.Data.tyc] = w3.tianyancha.parse.parse_helper

    @staticmethod
    def get_isp_parser(isp):
        parser = Config.isp_parser_map.get(isp)
        assert parser is not None
        return parser

    @staticmethod
    def get_valid_isp():
        lst = [w3.common.Data.aqc, w3.common.Data.qcc, w3.common.Data.tyc]
        if w3.common.Data.which_isp in lst:
            return w3.common.Data.which_isp
        return None


def check_key_match(index, key, item, keys_parse_failed_list, isp):
    # The detail page must echo back the social id we searched for;
    # otherwise the site returned an irrelevant firm.
    social_id = item.social_id
    if key != social_id:
        error_msg = "Error: not found. return irrelevant firm name:{} social id:{}.".format(item.firm_name, item.social_id)
        keys_parse_failed_list.append((index, key, error_msg, isp))
        if not w3.common.Data.log_quiet:
            s = "Index:{} key:{} isp:{} ".format(index, key, isp)
            s += error_msg
            util.dump(s)
        return False
    return True


def get_parser(index):
    # A fixed ISP takes precedence; otherwise rotate round-robin over all registered parsers.
    if w3.common.Data.which_isp is not None:
        return w3.common.Data.which_isp, Config.get_isp_parser(w3.common.Data.which_isp)
    r = index % len(Config.isp_parser_map)
    for i, (k, v) in enumerate(Config.isp_parser_map.items()):
        if r == i:
            return k, v


@util.measure("w3 parser")
def parse(keys):
    from faker import Factory
    faker = Factory.create()
    if w3.common.Data.use_fake_browser_info:
        browser = None
    else:
        # user_agent = faker.firefox()
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
        browser = Browser(user_agent=user_agent)
    latest_error_line = 0
    continual_error_line_count = 0
    parse_ok = 0
    parse_error = 0
    parse_not_found = 0
    for index, key in enumerate(keys):
        if index < w3.common.Data.line_index_load_start:
            continue
        if index >= w3.common.Data.line_index_load_stop:
            break
        if w3.common.Data.use_fake_browser_info:
            # A fresh browser with a faked user agent for each key.
            browser = Browser(user_agent=faker.firefox())
        isp, parse_helper = get_parser(index)
        w3.common.Data.line_index_actually_stop = index
        try:
            firm_info_items = parse_helper(browser, key)
            continual_error_line_count = 0
            result = check_key_match(index, key, firm_info_items, w3.common.Data.keys_parse_failed_list, isp)
            if result:
                w3.common.Data.firm_info_items_list.append((isp, firm_info_items))
                if not w3.common.Data.log_quiet:
                    print_firm_info(isp, index, firm_info_items)
                if w3.common.Data.save_single_record:
                    save_firm_data([(isp, firm_info_items)], None, key)
                parse_ok += 1
            else:
                parse_not_found += 1
            # Periodically flush results and failures to disk.
            if (w3.common.Data.save_file_per_parsed_lines > 0
                    and len(w3.common.Data.firm_info_items_list) % w3.common.Data.save_file_per_parsed_lines == 0):
                save_firm_data(w3.common.Data.firm_info_items_list)
                save_failed_parse_keys(w3.common.Data.keys_parse_failed_list)
            if index % 10 == 0:
                util.dump("Index:{}".format(index))
        except Exception:
            util.dump(format("Exception start", "=^80"))
            util.dump("Parse error with social id:{}".format(key))
            msg = print_stack()
            util.dump(format("Exception end", "-^80"))
            w3.common.Data.keys_parse_failed_list.append((index, key, msg, isp))
            if index == latest_error_line + 1:
                continual_error_line_count += 1
            else:
                continual_error_line_count = 1
            latest_error_line = index
            parse_error += 1
        finally:
            if w3.common.Data.use_fake_browser_info:
                close_window(browser)
        if continual_error_line_count > w3.common.Data.stop_when_continual_errors:
            # Too many consecutive failures usually means the site is throttling us:
            # save progress, close the browser, back off, then start a new session.
            util.dump("Pause as continual errors at the end index:{}".format(index))
            if w3.common.Data.source_input_delta_file is not None:
                clean_files()
            if not w3.common.Data.use_fake_browser_info:
                close_window(browser)
            time.sleep(w3.common.Data.pause_interval_as_continual_error)
            if not w3.common.Data.use_fake_browser_info:
                browser = Browser(user_agent=user_agent)
    if not w3.common.Data.use_fake_browser_info:
        close_window(browser)
    parse_error += parse_not_found
    if w3.common.Data.which_isp is None:
        isp = "RR"  # label for round-robin mode in the summary line
    else:
        isp = w3.common.Data.which_isp
    util.dump("ISP:{} parse succeeded count:{} error count:{} input count:{}, line start:{}, "
              "line stop:{} line actually stop(include):{}.".format(isp, parse_ok, parse_error, len(keys),
                                                                    w3.common.Data.line_index_load_start,
                                                                    w3.common.Data.line_index_load_stop - 1,
                                                                    w3.common.Data.line_index_actually_stop))
    save_firm_data(w3.common.Data.firm_info_items_list)
    save_failed_parse_keys(w3.common.Data.keys_parse_failed_list)
    if w3.common.Data.source_input_delta_file is not None:
        clean_files()
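The listing above covers the fixed-ISP and round-robin modes; the weighted mode mentioned earlier is not shown. A minimal sketch of how a weighted get_parser could look, assuming the same isp_parser_map shape (the ISP names, stand-in helpers, and weights here are placeholders, not values from the project):

    import random
    from collections import OrderedDict

    def get_weighted_parser(isp_parser_map, weights):
        # Pick an ISP with probability proportional to its weight.
        isps = list(isp_parser_map)
        picked = random.choices(isps, weights=[weights[isp] for isp in isps], k=1)[0]
        return picked, isp_parser_map[picked]

    # Placeholder map and weights: favor "aqc" 3:1:1 over the other sites.
    parsers = OrderedDict([("aqc", "aqc_helper"), ("qcc", "qcc_helper"), ("tyc", "tyc_helper")])
    print(get_weighted_parser(parsers, {"aqc": 3, "qcc": 1, "tyc": 1}))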
util.py provides a number of utility functions (a short usage example follows the listing). Its contents:
import csv
import io
import os
import sys
import threading
from time import time

global_print_lock = threading.Lock()


def dump(s, *, ostream=sys.stdout, sys_exit=None):
    # Thread-safe print; optionally terminate the process afterwards.
    with global_print_lock:
        print(s, file=ostream, flush=True)
    if sys_exit:
        sys.exit(1)


def measure(tag):
    # Decorator that logs wall-clock time around the wrapped call.
    def wrapper(f):
        def func(*args, **kwargs):
            ostream = kwargs.get("ostream")
            str_start = "Measure {} start".format(tag)
            dump(format(str_start, ">^80"), ostream=ostream, sys_exit=False)
            start = time()
            result = f(*args, **kwargs)
            end = time()
            elapsed = "Measure {} is over and elapsed {}".format(tag, get_readable_time(end - start))
            dump(format(elapsed, "<^80"), ostream=ostream, sys_exit=False)
            return result
        return func
    return wrapper


def transform_text_line_to_csv_row(line):
    # Parse a single text line as one CSV row.
    row = None
    with io.StringIO(line) as f:
        f_csv = csv.reader(f)
        for r in f_csv:  # Only one line here.
            row = r
    return row


def normalize_dir_path(path):
    if not (path.endswith("/") or path.endswith(os.sep)):
        path += "/"
    return path


def dedupe_helper(items):
    # Yield items in their original order, skipping duplicates.
    seen = set()
    for item in items:
        if item not in seen:
            yield item
            seen.add(item)


def dedupe(items):
    return list(dedupe_helper(items))


def get_readable_time(t):
    # Format a duration in seconds as "H hours M minutes S seconds".
    ft = ""
    h = 0
    i = int(t)
    m = i // 60
    s = i % 60 + round(t - i, 3)
    if m >= 60:
        h = m // 60
        m = m % 60
    if h:
        ft = "{} hours ".format(h)
    if m:
        ft += "{} minutes ".format(m)
    ft += "{} seconds".format(s)
    return ft
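As a short illustration (not part of the original util.py), the measure decorator and dump can be used together like this:

    from util import measure, dump

    @measure("demo")
    def crunch():
        total = sum(range(10 ** 6))
        dump("total:{}".format(total))

    crunch()
    # Prints a ">"-padded start banner, the total, then a "<"-padded banner
    # such as "Measure demo is over and elapsed 0.031 seconds" (timing varies).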