# -*- coding: utf-8 -*-
"""
分析:
#流程:
# 1.3 访问详情数据链接获取详情页数据
# 2. 解析数据
# 3. 提取数据
# 4. 保存数据
pass
第1页 http://www.neihanpa.com/article/index_{}.html
从 0 开始
http://www.neihanpa.com/e/action/ListInfo/?classid=11&page={}
爬取流程
循环获取列表页 -> 再访问详情页 -> 提取标题和内容
"""
import re
import html

from myreq import parse_url  # a small page-fetching helper I wrote myself; full code is below

DEBUG = False
class NeihanpaSpider:
    def __init__(self):
        self.base_url = "http://www.neihanpa.com"

    def save_content(self, content):
        print("*" * 50)
        print(content)
    def run(self):
        # Regex that extracts the detail-page links from a list page
        detail_link_pattern = re.compile('<a href="(.*?)" class="title" title=')
        # Regex that roughly cuts the content block out of a detail page
        detail_content_pattern = re.compile('<div class="detail">(.*?)<div class="art_newding">', re.S)
        # Regex that extracts the individual paragraphs from the content block
        detail_part_pattern = re.compile('<p>(.*?)</p>', re.S)
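        # Assumed markup, inferred from the regexes above rather than verified
        # against the live pages:
        #   list page  : <a href="/xxx.html" class="title" title="...">...</a>
        #   detail page: <div class="detail"> <p>...</p> ... <div class="art_newding">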
        # 1. Fetch data
        # 1.1 Build the list-page URLs
        url_list = []
        for page in range(1094):
            url = "http://www.neihanpa.com/e/action/ListInfo/?classid=11&page={}".format(page)
            url_list.append(url)
        # 1.2 Extract the detail-page links from each list page
        for url in url_list:
            list_html = parse_url(url)
            if list_html is None:
                continue
            detail_links = detail_link_pattern.findall(list_html)
            # 1.3 Fetch each detail page in turn
            for detail_link in detail_links:
                detail_html = parse_url(self.base_url + detail_link)
                if detail_html is None:
                    # TODO: record the failing URL in a log (see the log_failed_url sketch below)
                    continue
                # 2. Roughly cut out the content block
                detail_content_html = detail_content_pattern.findall(detail_html)
                if len(detail_content_html) > 0:
                    detail_content_html = detail_content_html[0]
                else:
                    continue
                # 3. Extract the text of every <p> tag inside the content block
                content = ""
                parts = detail_part_pattern.findall(detail_content_html)
                for part in parts:
                    part = html.unescape(part)
                    part = part.strip()
                    content = content + part + '\n'
                # 4. Save the content
                self.save_content(content)
                if DEBUG:
                    break
            if DEBUG:
                break
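

# A minimal sketch of the "record the failing URL in a log" idea referenced in
# run(). Both this helper and the file name "failed_urls.log" are illustrative
# assumptions, not part of the original spider.
def log_failed_url(url, path="failed_urls.log"):
    # Append a URL that could not be fetched to a plain-text log file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(url + "\n")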


if __name__ == '__main__':
    spider = NeihanpaSpider()
    spider.run()
# The parse_url helper used above is defined in a separate file (myreq.py):
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
Simple anti-scraping countermeasures:
    1. random User-Agent
    2. random proxy server
Fault tolerance:
    1. retries
    2. timeouts

Module interface:
    parse_url(url) -> returns the page HTML, or None on failure
'''
import requests
import random
from retrying import retry
# Random User-Agent pool
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15"
]
# Proxy server pools; in the future these two lists could come from a paid third-party provider
HTTP_PROXIES = [
    # "http://115.223.204.126:9000",
    # "http://118.117.136.114:9000"
]
HTTPS_PROXIES = [
    # "https://106.56.102.8:808"
]
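# requests expects proxies as a dict mapping scheme to proxy URL, e.g.
#     {"http": "http://host:port", "https": "https://host:port"}
# parse_url below builds such a dict from the two pools above.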
# Retry up to 5 times if the request raises (timeout, connection error, ...)
@retry(stop_max_attempt_number=5)
def __parse_url(url, method='get', data=None, proxies=None):
    print("*** requesting ***")
    headers = {
        "User-Agent": random.choice(USER_AGENT_LIST)
    }
    # timeout= makes slow requests fail fast so the retry decorator can try again
    if method == 'get':
        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,
            params=data
        )
    else:
        response = requests.post(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,
            data=data
        )
    response.encoding = 'utf-8'
    return response.text
def parse_url(url, method='get', data=None):
    '''
    Request a URL and return its HTML.
    :param url: the URL to fetch
    :param method: 'get' or 'post'
    :param data: query parameters (GET) or form data (POST)
    :return: the page HTML as text, or None if the request keeps failing
    '''
    html = None
    proxies = {
        "http": random.choice(HTTP_PROXIES) if len(HTTP_PROXIES) > 0 else None,
        "https": random.choice(HTTPS_PROXIES) if len(HTTPS_PROXIES) > 0 else None
    }
    try:
        html = __parse_url(url, method=method, proxies=proxies, data=data)
    except Exception:
        html = None
    if html is None:
        # Drop the proxy that just failed so it is not picked again
        scheme = requests.utils.urlparse(url).scheme
        if scheme == 'http' and proxies["http"] in HTTP_PROXIES:
            HTTP_PROXIES.remove(proxies["http"])
        elif scheme == 'https' and proxies["https"] in HTTPS_PROXIES:
            HTTPS_PROXIES.remove(proxies["https"])
    return html


if __name__ == '__main__':
    print(parse_url("https://www.baidu.com"))