Preface
Haodf Online (https://www.haodf.com) blocks crawler IPs. Normally a (high-anonymity) proxy IP would get around that, but the nasty part is that the site uses Knownsec's (知道创宇) cloud interception, which can recover the machine's real IP, so proxy IPs turn out to be pretty much useless here.
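For reference, plugging a high-anonymity proxy into requests looks roughly like the sketch below; the endpoint and credentials are placeholders, and as just noted, this alone does not get past the cloud interception:

import requests

# Hypothetical high-anonymity proxy endpoint; substitute a real one
proxies = {
    'http': 'http://user:password@proxy.example.com:8010/',
    'https': 'http://user:password@proxy.example.com:8010/',
}
response = requests.get('https://www.haodf.com', proxies=proxies, timeout=10)
print(response.status_code)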
Crawl source and target content
The crawl source is a list of 110,000 doctor-page URLs; from each page we collect its visit statistics.
Directory structure
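Reconstructed from the paths used in the code below (the script filename is an assumption):

.
├── craw.py            # the script below (name assumed)
└── files/
    ├── URL.csv        # input: one doctor-page URL per line
    ├── data.csv       # output: one CSV row per page
    ├── log_temp.txt   # URLs whose pages failed to parse
    └── log.txt        # second log file, truncated at startup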
Code
import requests
from lxml import etree
import math
import threading
import random
import time
import datetime
import csv
import gc
# Write a string to a file
def write_file(path_file, mode, write_str):
    with open(path_file, mode) as file:
        file.write(write_str)
# Write one row to a CSV file
def write_csv(path_file, mode, list_row):
    with open(path_file, mode, newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(list_row)
# Read the URL file into a list of lines
def read_file(path_file):
    with open(path_file, 'r') as file:
        lines = file.readlines()
    return lines
# Split all URLs into n roughly equal chunks
def chunks(url_list, n):
    chunks_list = []
    len_list = len(url_list)
    step = math.ceil(len_list / n)
    for i in range(0, n):
        chunks_list.append(url_list[i*step:(i+1)*step])
    return chunks_list
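# For example, chunks(['a', 'b', 'c', 'd', 'e'], 2) returns
# [['a', 'b', 'c'], ['d', 'e']], since step = ceil(5 / 2) = 3.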
# Fetch a page, retrying with a random User-Agent while blocked
def get_page(url):
    User_Agent = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)'
    ]
    # Pick a random User-Agent for this request
    user_agent = random.choice(User_Agent)
    count = 0
    status_code = 403
    html = ''
    # Keep retrying while blocked (403), with a growing back-off
    while status_code == 403:
        try:
            response = requests.get(
                url=url,
                proxies={
                    # 'http': 'http://c4b10796877647f297db63ecf2f92428:@proxy.crawlera.com:8010/',
                },
                headers={
                    'User-Agent': user_agent
                }
            )
            response.encoding = 'cp936'
            html = response.text
            status_code = response.status_code
        except requests.RequestException:
            html = ''
        count += 1
        time.sleep(count * 3)
        # Give up after 30 attempts
        if count > 30:
            break
    return html, url, status_code
# Extract the statistics from a doctor page
def get_info(html, url):
    info = list()
    info.append(url)
    try:
        # An empty html makes etree.HTML raise; the outer except catches it
        selector = etree.HTML(html)
        # Recommendation score (absent on some pages)
        try:
            recommend = selector.xpath('//span[@class="patient_recommend"]/i/text()')[0]
        except Exception:
            recommend = ''
        info.append(recommend)
        # Total visits
        visits = selector.xpath('//ul[@class="space_statistics"]/li[1]/span/text()')[0]
        info.append(visits)
        # Visits yesterday
        visits_yesterday = selector.xpath('//ul[@class="space_statistics"]/li[2]/span/text()')[0]
        info.append(visits_yesterday)
        # Date of yesterday's visits
        visits_yesterday_date = selector.xpath('//ul[@class="space_statistics"]/li[2]/text()')[1]
        visits_yesterday_date = visits_yesterday_date.replace('次(', '').replace(')', '')
        info.append(visits_yesterday_date)
        # Total articles
        articles = selector.xpath('//ul[@class="space_statistics"]/li[3]/span/text()')[0]
        info.append(articles)
        # Total patients
        patients = selector.xpath('//ul[@class="space_statistics"]/li[4]/span/text()')[0]
        info.append(patients)
        # Post-treatment check-ins yesterday
        patients_after_yesterday = selector.xpath('//ul[@class="space_statistics"]/li[5]/span/text()')[0]
        info.append(patients_after_yesterday)
        # Post-treatment check-ins via WeChat
        patients_after_wechat = selector.xpath('//ul[@class="space_statistics"]/li[6]/span/text()')[0]
        info.append(patients_after_wechat)
        # Total post-treatment check-ins
        patients_after = selector.xpath('//ul[@class="space_statistics"]/li[7]/span/text()')[0]
        info.append(patients_after)
        # Patient votes
        votes_patient = selector.xpath('//ul[@class="space_statistics"]/li[8]/span/text()')[0]
        info.append(votes_patient)
        # Thank-you letters
        letters_thanks = selector.xpath('//ul[@class="space_statistics"]/li[9]/span/text()')[0]
        info.append(letters_thanks)
        # Gifts
        gifts = selector.xpath('//ul[@class="space_statistics"]/li[10]/span/text()')[0]
        info.append(gifts)
        # Last online
        online_last = selector.xpath('//ul[@class="space_statistics"]/li[11]/span/text()')[0]
        info.append(online_last)
        # Date the page was opened
        opening_time = selector.xpath('//ul[@class="space_statistics"]/li[12]/span/text()')[0]
        info.append(opening_time)
    except Exception:
        # Any parse failure leaves info holding only the URL; the caller logs it
        pass
    return info
# Serialise file writes so concurrent threads do not interleave rows
write_lock = threading.Lock()
# index is this thread's position in chunks_list
def craw(index, chunks_list, path_log_file):
    url_list = chunks_list[index]
    for url in url_list:
        url = url.replace('\n', '')
        html, url, status_code = get_page(url)
        info = get_info(html, url)
        if len(info) == 1:
            # Parsing failed: log the URL for a later retry
            with write_lock:
                write_file(path_log_file, 'a', url + '\n')
        else:
            # Prepend the crawl timestamp and append the row to the CSV
            # (path_data is defined at module level in the __main__ block)
            now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            info.insert(1, now_time)
            with write_lock:
                write_csv(path_file=path_data, mode='a', list_row=info)
            print(info)
        del info
    # Free memory held by this thread's chunk
    del url_list
    gc.collect()
def main(path_url_file, n, path_log_file):
    # Read the URL file and split it into n chunks, one per thread
    list_url = read_file(path_url_file)
    chunks_list = chunks(list_url, n)
    thread_list = []
    for index in range(0, n):
        thread = threading.Thread(target=craw, args=(index, chunks_list, path_log_file))
        thread_list.append(thread)
    for t in thread_list:
        t.daemon = True
        t.start()
    for t in thread_list:
        t.join()
if __name__ == '__main__':
    path_url = './files/URL.csv'
    path_data = './files/data.csv'
    path_log_temp = './files/log_temp.txt'
    path_log = './files/log.txt'
    # Truncate the log files
    write_file(path_file=path_log_temp, mode='w', write_str='')
    write_file(path_file=path_log, mode='w', write_str='')
    # Write the header row to the CSV
    title_row = [
        'URL', 'crawl time', 'recommendation score', 'total visits', 'visits yesterday',
        "date of yesterday's visits", 'total articles', 'total patients',
        'post-treatment check-ins yesterday', 'post-treatment check-ins via WeChat',
        'total post-treatment check-ins', 'patient votes', 'thank-you letters',
        'gifts', 'last online', 'date opened'
    ]
    write_csv(path_file=path_data, mode='w', list_row=title_row)
    # Crawl the URL file with n = 200 threads
    main(path_url_file=path_url, n=200, path_log_file=path_log_temp)
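To reproduce: put one doctor-page URL per line in ./files/URL.csv, save the script (here assumed to be craw.py), and run `python craw.py`. Rows accumulate in ./files/data.csv, and any URL whose page could not be parsed is appended to ./files/log_temp.txt so it can be retried later.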
Results