import time
import requests
from lxml import etree
from multiprocessing.dummy import Pool
from requests.exceptions import RequestException
import openpyxl
def get_one_page(url):
    # Fetch a list page; return its HTML text, or None on any failure
    try:
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    # Each blog post sits in an "article-item-box" container on the list page
    ii_list = html.xpath('//div[@class="article-item-box csdn-tracking-statistics"]')
    for ii in ii_list:
        try:
            # Extract post type, title, date and read count
            type_ = ii.xpath('./h4/a/span/text()')[0]
            title = ii.xpath('./h4/a/text()')[1].strip()
            date = ii.xpath('.//span[@class="date"]/text()')[0].strip()
            read_num = ii.xpath('.//span[@class="read-num"]/text()')[0]
            # Note: openpyxl worksheets are not thread-safe; see the lock sketch below
            sheet.append([type_, title, date, read_num])
        except Exception:
            pass  # skip entries missing any of the fields
def main(offset):
    # Per-page entry point: build the list URL for this page, fetch and parse it
    base_url = 'https://blog.csdn.net/qq_41185868/article/list/{}'
    url = base_url.format(offset)
    page = get_one_page(url)
    if page is None:
        return  # etree.HTML(None) would raise, so bail out on failed fetches
    parse_one_page(etree.HTML(page))
if __name__ == '__main__':
    wb = openpyxl.Workbook()  # workbook object
    sheet = wb.active         # the active worksheet
    # Header row
    sheet.append(['type_', 'title', 'date', 'read_num'])
    # Request headers (note the space before 'Chrome', which the original concatenation dropped)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/62.0.3202.94 Safari/537.36'}
    # Thread pool (multiprocessing.dummy.Pool runs workers as threads)
    print('Multithreaded scrape started')
    start_time = time.time()
    p = Pool(8)
    p.map(main, range(1, 50))
    # Close the pool and wait for the worker threads to finish,
    # so the main thread does not exit before they do
    p.close()
    p.join()
    # Save location
    wb.save(r'C:\Users\Administrator\Desktop\info.xlsx')
    end_time = time.time()
    print('Multithreaded scrape finished')
    print('Elapsed:', end_time - start_time)
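One caveat about the script above: openpyxl does not document Worksheet.append as thread-safe, so eight worker threads writing to `sheet` concurrently can interleave rows. A minimal sketch of serializing the writes with threading.Lock; the `sheet_lock` name and the `append_row` helper are illustrative, not part of the original script:

import threading

sheet_lock = threading.Lock()  # hypothetical module-level lock shared by all workers

def append_row(row):
    # Only one thread may touch the worksheet at a time
    with sheet_lock:
        sheet.append(row)

With this in place, parse_one_page would call append_row([type_, title, date, read_num]) instead of sheet.append(...).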
Handling pages whose URL never changes
Adapted from: Python轻松实现动态网页爬虫
When a site loads its results via Ajax, the address-bar URL stays the same no matter which page you are on. The trick is to open the browser's developer tools, find the XHR request that actually fetches the data, and call that endpoint directly, which is what the script below does.
from urllib.parse import urlencode
import csv
import random
import requests
import traceback
from time import sleep
from lxml import etree  # lxml is a fast third-party HTML/XML parser
base_url = 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search_content.jsp?'  # replace with the link of the corresponding Ajax request
headers = {
'Connection': 'keep-alive',
'Accept': '*/*',
'X-Requested-With': 'XMLHttpRequest',
'User-Agent': '******',
'Origin': 'http://www.hshfy.sh.cn',
'Referer': 'http://www.hshfy.sh.cn/shfy/gweb2017/ktgg_search.jsp?zd=splc',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': '******'
}
def get_page(page):
    # Try the request up to three times before giving up on this page.
    # The original loop never decremented the counter on connection errors
    # and spun forever on non-200 responses; both paths now count as failures.
    n = 3
    while True:
        try:
            # sleep(random.uniform(1, 2))  # optional random 1-2 s delay between requests
            data = {
                'yzm': 'Vewv',
                'ft': '',
                'ktrqks': '2020-10-22',
                'ktrqjs': '2020-11-22',
                'spc': '',
                'yg': '',
                'bg': '',
                'ah': '',
                'pagesnum': page
            }
            # The form fields travel in the query string, mirroring the site's own Ajax call
            url = base_url + urlencode(data)
            print(url)
            response = requests.post(url, headers=headers)
            response.raise_for_status()  # a non-200 status counts as a failure and triggers a retry
            return response.content.decode('gbk')  # the site serves GBK-encoded HTML
        except Exception as e:
            print('Error', e)  # show why the request failed
            n -= 1
            if n == 0:
                print('All 3 requests failed; giving up on this URL, check the request parameters')
                return None
            print('Request failed, retrying')
def parse_page(html):
    try:
        parse = etree.HTML(html)  # parse the returned HTML
        items = parse.xpath('//*[@id="report"]/tbody/tr')
        for item in items[1:]:  # skip the table's header row
            item = {
                'a': ''.join(item.xpath('./td[1]/font/text()')).strip(),
                'b': ''.join(item.xpath('./td[2]/font/text()')).strip(),
                'c': ''.join(item.xpath('./td[3]/text()')).strip(),
                'd': ''.join(item.xpath('./td[4]/text()')).strip(),
                'e': ''.join(item.xpath('./td[5]/text()')).strip(),
                'f': ''.join(item.xpath('./td[6]/div/text()')).strip(),
                'g': ''.join(item.xpath('./td[7]/div/text()')).strip(),
                'h': ''.join(item.xpath('./td[8]/text()')).strip(),
                'i': ''.join(item.xpath('./td[9]/text()')).strip()
            }
            # print(item)
            try:
                # 'a' appends to the file; utf_8_sig keeps the CSV from garbling in Excel
                with open('C:\\Users\\Administrator\\Desktop\\law.csv', 'a', encoding='utf_8_sig', newline='') as fp:
                    fieldnames = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
                    writer = csv.DictWriter(fp, fieldnames)
                    writer.writerow(item)  # see the sketch after the script for writing a header row once
            except Exception:
                traceback.print_exc()  # prints the full traceback itself; no surrounding print() needed
    except Exception:
        traceback.print_exc()
if __name__ == '__main__':
    for page in range(1, 3):  # set the number of pages to crawl here
        html = get_page(page)
        if html is None:
            continue  # get_page already reported the failure
        parse_page(html)
        print('Page ' + str(page) + ' extracted')
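As written, parse_page reopens the CSV for every row and never emits column names, so the output file starts straight with data. A minimal sketch of opening the file once and writing the header a single time; extract_rows is a hypothetical variant of parse_page that returns the row dicts instead of writing them itself:

import csv

fieldnames = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
with open('law.csv', 'w', encoding='utf_8_sig', newline='') as fp:
    writer = csv.DictWriter(fp, fieldnames)
    writer.writeheader()  # column names land in the file exactly once
    for page in range(1, 3):
        html = get_page(page)  # from the script above
        if html is None:
            continue
        for row in extract_rows(html):  # hypothetical: yields the dicts parse_page builds
            writer.writerow(row)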