Requirements:
The problem: I want to restructure the code into the following shape:

    for each overview page:
        build the overview URL and request parameters
        call the HTTP fetch function
        call the overview parser to get the list of detail entries
        for each detail entry:
            fetch the detail page
            parse the detail page into a final result and append it to []
    return [final results]

Then the pagination function only has to build the request URL, headers, or body.
It could look like this:

    def make_pageno(pageidx, head_dict):
        # returns (url, body, error_message)

1. The overview loop builds the page-turn request, fetches the overview page, and parses it into the detail list.
2. Inside the overview loop, once the detail list is in hand, iterate over it in place: fetch and parse each detail page to produce the final info_t (see the sketch right after this list).
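A minimal runnable sketch of that target structure, assuming a hypothetical site whose overview pages follow a numeric URL pattern. make_pageno matches the signature above; the example.com URL and the xpaths are placeholders, not the real site's:

    import requests
    from lxml import etree

    def make_pageno(pageidx, head_dict):
        # Build the request for overview page `pageidx`; returns (url, body, errmsg).
        # Placeholder URL pattern -- substitute the real site's pagination scheme.
        if pageidx < 1:
            return None, None, 'invalid page index'
        return 'http://example.com/list/%d.html' % pageidx, None, ''

    def crawl(pages, head_dict=None):
        results = []
        for pageidx in range(1, pages + 1):            # overview-page loop
            url, body, err = make_pageno(pageidx, head_dict)
            if err:
                break
            resp = requests.get(url, headers=head_dict, timeout=(3, 7))
            html = etree.HTML(resp.text)
            # Placeholder overview xpath; assumes the hrefs are absolute URLs.
            for href in html.xpath('//li/a/@href'):    # detail loop, in place
                detail = etree.HTML(requests.get(href, timeout=(3, 7)).text)
                # Parse the detail page into the final record and collect it.
                results.append({
                    'url': href,
                    'title': ''.join(detail.xpath('//h1/text()')).strip(),
                })
        return results

The full spider below applies this structure to the real site.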
import string
import time
import requests
from lxml import etree
class info_t:
    """Information record to be stored in the database."""
    def __init__(self, url, title, pub_time, content):
        self.url = url            # detail-page URL of the record
        self.title = title        # title
        self.content = content    # main body content
        self.pub_time = pub_time  # publication time
        self.addr = None          # region the record belongs to
        self.keyword = None       # keywords
        self.ext = None           # extended dict of site-specific extras
        self.memo = None          # free-form note about this record
        print(self.url, '\n', self.title, '\n', self.pub_time, '\n', self.content)
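# Since info_t is meant to be persisted, a small helper (an assumed convenience,
# not part of the original) flattens it into a plain dict for the storage layer:
def info_to_dict(info):
    """Turn an info_t into a dict, e.g. for a DB insert or json.dumps."""
    return {'url': info.url, 'title': info.title, 'pub_time': info.pub_time,
            'content': info.content, 'addr': info.addr, 'keyword': info.keyword,
            'ext': info.ext, 'memo': info.memo}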
class Spider:
    def __init__(self):
        # Up-front configuration; add further parameters here as needed
        self.num = 2  # pages to crawl is n+1, so the default of 2 crawls one page
        self.url = "http://ggzyjy.xzfwzx.putian.gov.cn/ptsq/005002/005002003/guidetyright.html"
        self.domain = 'http://ggzyjy.xzfwzx.putian.gov.cn/'  # some URLs need the domain prepended
        self.lst_xpaths = [('divs', '//*[@class="container mt20 clearfix"]/div/div'), ('src', './span/a/@href'),
                           ('genal_divs', '//*[@class="news-items"]/li'), ('title', './a/text()'),
                           ('url', './a/@href')]  # xpaths for overview extraction
        self.pge_xpaths = [('pubdate', '//*[@class="ewb-article-sources"]/p[2]/text()'),
                           ('content', '//*[@class="ewb-article-info"]//text()')]  # xpaths for detail extraction
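    # The (name, xpath) tuples above are read by list index below; a dict view
    # (an assumption, not used by the original code) makes call sites clearer:
    def xpath_of(self, name):
        """Look up a configured xpath by name, e.g. self.xpath_of('title')."""
        return dict(self.lst_xpaths + self.pge_xpaths)[name]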
    def http_take(self, url, http_method, head=None, body=None):
        """Fetch url via GET or POST; returns a requests.Response, or None on failure."""
        # The User-Agent must go into the *request* headers; assigning it to
        # resp.headers after the response arrives has no effect.
        headers = dict(head) if head else {}
        headers.setdefault('User-Agent', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36")
        resp = None
        try:
            if http_method.upper() == 'POST':
                print("Sending POST request")
                # requests.post() takes the payload as data=, not body=
                resp = requests.post(url, headers=headers, data=body, timeout=(3, 7))
            else:
                print("Sending GET request")
                resp = requests.get(url, headers=headers, timeout=(3, 7))
            resp.raise_for_status()  # raise HTTPError on a non-2xx status code
            resp.encoding = resp.apparent_encoding  # make sure the page decodes correctly
        # Specific exception types must come first: a bare `except Exception`
        # placed before them would make every later clause unreachable.
        except requests.exceptions.ConnectTimeout:
            print('Request timed out!')
        except requests.exceptions.ConnectionError:
            print('Invalid address!')
        except requests.exceptions.HTTPError as e:
            print(e.response.status_code, '\n', e.response.reason)
        except requests.exceptions.RequestException as e:
            print('Request failed: {0}'.format(e))
        else:
            print('Request completed successfully')
            return resp
        return None
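    # Government portals can be flaky; a thin retry wrapper around http_take
    # (an assumed helper, not part of the original) is often enough:
    def http_take_retry(self, url, http_method='get', tries=3, delay=1.0):
        """Retry http_take up to `tries` times, sleeping `delay` seconds between tries."""
        for _ in range(tries):
            resp = self.http_take(url, http_method)
            if resp is not None:
                return resp
            time.sleep(delay)
        return None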
    # Entry point: fetch the overview list from the initial URL, then drive
    # the parsing and pagination functions
    def start_get(self, num):
        if num == 0:
            num = self.num
        resp = self.http_take(self.url, 'get')
        if resp is None:
            return
        html = etree.HTML(resp.text)
        divs = html.xpath(self.lst_xpaths[0][1])
        for div in divs[1:]:
            # Get the first overview-page URL for each region
            total_src = self.domain + ''.join(div.xpath(self.lst_xpaths[1][1])).strip()
            self.analyze_(total_src)  # parse the first overview page
            for i in range(2, num):  # turn the overview pages
                new_url = self.change_page(total_src, i)
                self.analyze_(new_url)  # parse each subsequent overview page
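    # The requirements ask for the final records to be returned; a variant of
    # start_get that aggregates what analyze_ returns (an assumed extension,
    # not part of the original) could look like:
    def start_get_collect(self, num=0):
        """Like start_get, but returns the collected list of info_t records."""
        if num == 0:
            num = self.num
        resp = self.http_take(self.url, 'get')
        if resp is None:
            return []
        html = etree.HTML(resp.text)
        all_infos = []
        for div in html.xpath(self.lst_xpaths[0][1])[1:]:
            total_src = self.domain + ''.join(div.xpath(self.lst_xpaths[1][1])).strip()
            all_infos.extend(self.analyze_(total_src))
            for i in range(2, num):
                all_infos.extend(self.analyze_(self.change_page(total_src, i)))
        return all_infos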
    # Extract information from an overview page and each of its detail pages
    def analyze_(self, url):
        resp1 = self.http_take(url, 'get')
        if resp1 is None:
            return []
        html1 = etree.HTML(resp1.text)
        divs1 = html1.xpath(self.lst_xpaths[2][1])
        result = []
        for div1 in divs1:
            # Loop over the overview entries: each yields a title and a detail URL
            title = 'Title: ' + ''.join(div1.xpath(self.lst_xpaths[3][1])).strip()
            detail_url = self.domain + ''.join(div1.xpath(self.lst_xpaths[4][1])).strip()
            resp2 = self.http_take(detail_url, 'get')
            if resp2 is None:
                continue  # skip entries whose detail page cannot be fetched
            html2 = etree.HTML(resp2.text)
            # Parse the detail page
            pub_time = ''.join(html2.xpath(self.pge_xpaths[0][1])).strip()
            content = ''.join(html2.xpath(self.pge_xpaths[1][1])).strip()
            result.append(info_t(detail_url, title, pub_time, content))
        return result
    # Site-specific pagination: page n's URL is the first page's URL with
    # 'subPageright' replaced by the page number
    def change_page(self, urls, num):
        urls = urls.strip('\n')
        urls = urls.strip(string.punctuation)
        url = urls.replace('subPageright', '%d')
        new_url = url % num  # the format() wrapper here was redundant
        return new_url
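# A quick check of change_page with a made-up URL that follows the same
# 'subPageright' pattern (the sample URL is hypothetical, not the real site's):
#   Spider().change_page('http://example.com/005002003/subPageright.html', 2)
#   -> 'http://example.com/005002003/2.html'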
if __name__ == '__main__':
    s = Spider()
    # Configure the page count here: to crawl n pages pass n+1; the default
    # argument 0 falls back to self.num, which crawls one page
    s.start_get(0)