# Crawl jokes from neihan8.com using requests.
# - Retry on timeout via the retrying module.
# - Basic anti-anti-scraping: random User-Agent and random proxy servers.
"""
myreq.py
模块功能:
parse_url 方法给 url 就返回 html
简单反反爬
1.随机 User-Agent
2.随机代理服务器
容错处理
1.重试
2.超时
"""
import requests
import random
from retrying import retry
# Pool of desktop-browser User-Agent strings; one is chosen at random per request.
USER_AGENT_LIST = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15"
]
# Candidate HTTP proxies; entries that fail are removed at runtime by parse_url.
HTTP_PROXIES = [
'http://115.223.197.119:9000',
]
# Candidate HTTPS proxies; entries that fail are removed at runtime by parse_url.
HTTPS_PROXIES = [
'https://218.60.8.99:3129'
]
@retry(stop_max_attempt_number=5)
def __parse_url(url, method='get', data=None, proxies=None):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Retried up to 5 times by the ``retrying`` decorator on any exception
    (timeouts, connection errors, HTTP error statuses).

    :param url: target URL.
    :param method: ``'get'`` sends a GET request; anything else sends a POST.
    :param data: query parameters for GET, form body for POST (dict or None).
    :param proxies: requests-style proxies mapping, e.g. ``{'http': ...}``.
    :return: response text.
    """
    print("***请求中***")
    # Avoid mutable default arguments: normalize None to fresh dicts here.
    data = {} if data is None else data
    proxies = {} if proxies is None else proxies
    headers = {
        # Rotate the User-Agent on every attempt, including retries.
        'User-Agent': random.choice(USER_AGENT_LIST)
    }
    if method == 'get':
        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,  # fail fast so the retry decorator can kick in
            params=data
        )
    else:
        response = requests.post(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,
            data=data  # BUG FIX: POST payload belongs in the body (data=), not the query string (params=)
        )
    # Treat HTTP 4xx/5xx as failures so they are retried / reported upstream.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text
def parse_url(url, method='get', data=None):
    """Request *url* and return its HTML, or None on failure.

    A random proxy is drawn from the pool matching the URL scheme. When the
    request still fails after all retries, the (presumably dead) proxy is
    removed from its pool so it is not picked again.

    :param url: target URL.
    :param method: 'get' or 'post', forwarded to __parse_url.
    :param data: query parameters / form body (dict or None).
    :return: HTML text, or None if the request ultimately failed.
    """
    proxies = {
        "http": random.choice(HTTP_PROXIES) if HTTP_PROXIES else None,
        "https": random.choice(HTTPS_PROXIES) if HTTPS_PROXIES else None
    }
    try:
        html = __parse_url(url, method=method, proxies=proxies, data=data)
    except Exception:  # narrowed from bare `except:` so Ctrl-C still works
        html = None
    if html is None:
        # The proxy for this scheme is likely dead; retire it — but only if
        # one was actually used (remove(None) would raise ValueError).
        scheme = requests.utils.urlparse(url).scheme
        if scheme == 'http' and proxies['http'] is not None:
            print("当前代理无效:", proxies['http'])
            HTTP_PROXIES.remove(proxies['http'])
        elif scheme == 'https' and proxies['https'] is not None:
            print("当前代理无效:", proxies["https"])
            HTTPS_PROXIES.remove(proxies["https"])
    return html
if __name__ == '__main__':
    # Smoke test: fetch Baidu's homepage and dump the HTML (or None on failure).
    html = parse_url("https://www.baidu.com")
    print(html)
# - Scrape the list of joke detail URLs from each returned index page.
# - Fetch and parse each detail page from that URL list.
"""
https://www.neihan8.com/e/action/ListInfo/?classid=11&page=1093
"""
import re
from html import unescape
from html.parser import HTMLParser

from myreq import parse_url
DEBUG = False  # when True, stop after the first index page (smoke-test mode)
class Neihan8Spider(object):
    """Scrape jokes from neihan8.com.

    Walks the paginated index (classid=11), follows every detail link found
    on each page, extracts the joke paragraphs and prints them.
    """

    def __init__(self):
        # Detail links in the index pages are site-relative; prefix with this.
        self.base_detail_url = 'https://www.neihan8.com'

    def save_content(self, content):
        """Persist one joke. Currently just prints it with a separator line."""
        print("*" * 50)
        print(content)

    def run(self):
        """Crawl every index page and save the text of every joke found."""
        # Compile the patterns once, outside the crawl loops.
        detail_link_pattern = re.compile(r'<a href="(.*)" class="title" title')
        detail_content_pattern = re.compile(r'<div class="detail">(.*)<div class="ad610">', re.S)
        detail_part_pattern = re.compile(r'<p>(.*?)</p>')
        url_list = [
            'https://www.neihan8.com/e/action/ListInfo/?classid=11&page={}'.format(i)
            for i in range(0, 1094)
        ]
        for url in url_list:
            page_html = parse_url(url)
            if page_html is None:
                # Request failed even after retries; skip this index page
                # instead of crashing on findall(None).
                continue
            detail_url_list = detail_link_pattern.findall(page_html)
            if DEBUG:
                break
            for detail_url in detail_url_list:
                detail_html = parse_url(self.base_detail_url + detail_url)
                if detail_html is None:
                    continue
                matches = detail_content_pattern.findall(detail_html)
                if not matches:
                    continue
                detail_content = matches[0]
                # HTMLParser().unescape was removed in Python 3.9;
                # html.unescape is the supported replacement.
                content = ''.join(
                    unescape(part).strip() + '\n'
                    for part in detail_part_pattern.findall(detail_content)
                )
                self.save_content(content)
                if DEBUG:
                    break
if __name__ == '__main__':
    # Kick off a full crawl when executed as a script.
    Neihan8Spider().run()