1. 反爬策略之代理IP
首先需要购买代理IP
import urllib.request as ur
# Anti-scraping strategy 1: send the request through a purchased proxy IP.
#
# Step 1: ask the proxy vendor's API for one proxy address. The body is
# decoded as UTF-8 and stripped of surrounding whitespace. The response is
# closed as soon as it has been read.
# NOTE(review): the `order=` value in the URL is presumably the account's
# order key -- consider loading it from configuration instead of source.
with ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4') as api_response:
    proxy_address = api_response.read().decode('utf-8').strip()
print(proxy_address)  # e.g. 115.207.36.202:38979 (proxy IPs expire; the order must be renewed afterwards)

# Step 2: create the proxy handler. ProxyHandler maps a URL scheme to the
# proxy used for that scheme. The page fetched below is served over https,
# so the proxy has to be registered for 'https' as well -- with an
# 'http'-only mapping the request would go out directly, without the proxy.
proxy_handler = ur.ProxyHandler(
    {
        'http': proxy_address,
        'https': proxy_address,
    }
)

# Step 3: build an opener that uses the handler.
proxy_opener = ur.build_opener(proxy_handler)
request = ur.Request(url='https://edu.csdn.net/')

# Step 4: fetch the page through the opener and print the raw bytes.
with proxy_opener.open(request) as page:
    response = page.read()
print(response)
2. 反爬策略之模拟登录
import urllib.request as ur
import user_agent
import lxml.etree as le
# Anti-scraping strategy 2: simulated login.
# The request replays the Cookie header of an already logged-in browser
# session together with a User-Agent supplied by the user_agent module.
login_headers = {
    'User-Agent': user_agent.get_user_agent_pc(),
    # The Cookie header is mandatory; without it the login is not simulated.
    'Cookie': 'uuid_tt_dd=10_291046340-1549030766199-774346; smidV2=2019032808063205c4b158a28825bdfbb6108edc9a5f510091c3fec0aad6150; ADHOC_MEMBERSHIP_CLIENT_ID1.0=39699c49-111c-7665-4452-b270ee17f1b4; _ga=GA1.2.924224088.1556522733; UM_distinctid=16a6800cde7335-02ca3a55a6d7aa-3a614f0b-1fa400-16a6800cde9373; UN=kzl_knight; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_291046340-1549030766199-774346!5744*1*kzl_knight!1788*1*PC_VC; dc_session_id=10_1563242124060.407147; UserName=kzl_knight; UserInfo=87b6c178121e4ebb9c698b6c3bbcdcd9; UserToken=87b6c178121e4ebb9c698b6c3bbcdcd9; UserNick=kzl_knight; AU=5AA; BT=1564751420031; p_uid=U000000; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*kzl_knight!6525*1*10_291046340-1549030766199-774346; __yadk_uid=b07B7nB4DNxlZBhsPmydqZ8oixohVRvY; TY_SESSION_ID=df97a907-a50c-4bca-a926-4b9edd956598; cname11736=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1565085616,1565100208,1565143666,1565148547; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1565156088; dc_tos=pvuq60',
}
request = ur.Request(url='https://edu.csdn.net/mycollege', headers=login_headers)

# Download the page body and decode it as UTF-8 text.
raw_page = ur.urlopen(request).read()
response = raw_page.decode('utf-8')

# Debugging aid: uncomment to dump the fetched page to a local file.
# with open('mycollege2.html','w',encoding='utf-8') as f:
#     f.write(response)

# Parse the HTML and print the link texts selected by the XPath expression.
html_x = le.HTML(response)
title_s = html_x.xpath('//li[@class="item_box"]//h1/a/text()')
print(title_s)
3. CSDN热门文章抓取
import urllib.request as ur
import lxml.etree as le
import user_agent
# Interactive search parameters: the keyword is kept as typed, the two page
# numbers are converted to int (a non-numeric answer raises ValueError).
keyword = input('请输入关键词:')
first_page_text = input('起始页:')
pn_start = int(first_page_text)
last_page_text = input('终止页:')
pn_end = int(last_page_text)
def getRequest(url):
    """Wrap *url* in a urllib Request that carries a User-Agent header.

    Args:
        url: Address the request should target.

    Returns:
        A ``urllib.request.Request`` whose only explicit header is
        ``User-Agent``, with the value obtained from
        ``user_agent.get_user_agent_pc()`` on every call.
    """
    request_headers = {'User-Agent': user_agent.get_user_agent_pc()}
    return ur.Request(url=url, headers=request_headers)
def getProxyOpener():
    """Build a urllib opener that sends requests through a proxy IP.

    Every call requests one proxy address from the vendor API; the response
    body is decoded as UTF-8 and ``strip()`` removes the surrounding
    whitespace. The address is registered for both ``http`` and ``https``:
    ``ProxyHandler`` selects the proxy by URL scheme, so an ``http``-only
    mapping would let https requests bypass the proxy.

    Returns:
        The opener produced by ``urllib.request.build_opener`` for the
        configured ``ProxyHandler``.

    Raises:
        urllib.error.URLError: If the proxy API cannot be reached.
    """
    # The API response is closed as soon as the address has been read.
    with ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4') as api_response:
        proxy_address = api_response.read().decode('utf-8').strip()
    proxy_handler = ur.ProxyHandler(
        {
            'http': proxy_address,
            'https': proxy_address,
        }
    )
    return ur.build_opener(proxy_handler)
# Crawl the CSDN blog search results for `keyword`, pages pn_start..pn_end
# (inclusive), and store every linked article as blog/<title>.html.
import os
import re
from urllib.parse import quote

# Create the output directory up front so open() below cannot fail merely
# because the directory is missing.
os.makedirs('blog', exist_ok=True)

for pn in range(pn_start, pn_end + 1):
    # Percent-encode the keyword: a non-ASCII keyword (e.g. Chinese) cannot
    # be encoded into the HTTP request line, so without quoting such a
    # search would fail on every page.
    request = getRequest(
        'https://so.csdn.net/so/search/s.do?p=%s&q=%s&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0' % (pn, quote(keyword))
    )
    try:
        response = getProxyOpener().open(request).read()
        href_s = le.HTML(response).xpath('//span[@class="down fr"]/../span[@class="link"]/a/@href')
    except Exception as e:
        # A failed result page is reported (instead of silently ignored)
        # and the crawl continues with the next page. Catching Exception
        # rather than everything keeps Ctrl-C working.
        print(e)
        continue
    for href in href_s:
        try:
            response_blog = getProxyOpener().open(
                getRequest(href)
            ).read()
            # Extract the article title via XPath; a page without a matching
            # <h1> raises IndexError, which is reported by the handler below.
            title = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')[0]
            print(title)
            # Titles can contain path separators, line breaks or characters
            # that are not allowed in file names; replace them so the
            # article is still written.
            file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
            # Save the fetched page in the blog folder as <title>.html.
            with open('blog/%s.html' % file_name, 'wb') as f:
                f.write(response_blog)
        except Exception as e:
            print(e)  # report the error and continue with the next article