python基础（5） —— python爬虫实战

最新推荐文章于 2024-05-29 15:23:22 发布

帆_5021

最新推荐文章于 2024-05-29 15:23:22 发布

阅读量210

点赞数

分类专栏： python基础

本文链接：https://blog.csdn.net/weixin_43093289/article/details/105895976

版权

python基础专栏收录该内容

6 篇文章 0 订阅

订阅专栏

1. 反爬策略之代理IP

首先需要购买代理IP

import urllib.request as ur

# Fetch one "ip:port" proxy address from the paid proxy API.
# strip() removes the surrounding whitespace/newline returned by the API.
proxy_address = ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4').read().decode('utf-8').strip()
print(proxy_address) # e.g. 115.207.36.202:38979 (the proxy expires and must be renewed)

# Create the proxy handler.
# ProxyHandler only proxies the URL schemes listed in its mapping; the target
# below is an https URL, so 'https' must be mapped too or the request would
# bypass the proxy and go out directly.
proxy_handler = ur.ProxyHandler(
    {
        'http': proxy_address,
        'https': proxy_address,
    }
)
# Build an opener that routes requests through the proxy handler.
proxy_opener = ur.build_opener(proxy_handler)

request = ur.Request(url='https://edu.csdn.net/')

# Fetch the requested URL through the opener and print the raw response bytes.
response = proxy_opener.open(request).read()
print(response)

2. 反爬策略之模拟登录

import urllib.request as ur
import user_agent
import lxml.etree as le

# Headers sent with the request. The Cookie carries the logged-in session and
# is required; without it the simulated login does not work.
login_headers = {
    'User-Agent': user_agent.get_user_agent_pc(),
    'Cookie': 'uuid_tt_dd=10_291046340-1549030766199-774346; smidV2=2019032808063205c4b158a28825bdfbb6108edc9a5f510091c3fec0aad6150; ADHOC_MEMBERSHIP_CLIENT_ID1.0=39699c49-111c-7665-4452-b270ee17f1b4; _ga=GA1.2.924224088.1556522733; UM_distinctid=16a6800cde7335-02ca3a55a6d7aa-3a614f0b-1fa400-16a6800cde9373; UN=kzl_knight; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_291046340-1549030766199-774346!5744*1*kzl_knight!1788*1*PC_VC; dc_session_id=10_1563242124060.407147; UserName=kzl_knight; UserInfo=87b6c178121e4ebb9c698b6c3bbcdcd9; UserToken=87b6c178121e4ebb9c698b6c3bbcdcd9; UserNick=kzl_knight; AU=5AA; BT=1564751420031; p_uid=U000000; Hm_ct_e5ef47b9f471504959267fd614d579cd=5744*1*kzl_knight!6525*1*10_291046340-1549030766199-774346; __yadk_uid=b07B7nB4DNxlZBhsPmydqZ8oixohVRvY; TY_SESSION_ID=df97a907-a50c-4bca-a926-4b9edd956598; cname11736=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1565085616,1565100208,1565143666,1565148547; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1565156088; dc_tos=pvuq60',
}
request = ur.Request(url='https://edu.csdn.net/mycollege', headers=login_headers)

# Download the page and decode the body as UTF-8 text.
raw_page = ur.urlopen(request).read()
response = raw_page.decode('utf-8')

# Parse the HTML and print the text of every <h1>/<a> found under
# <li class="item_box"> elements.
html_x = le.HTML(response)
title_s = html_x.xpath('//li[@class="item_box"]//h1/a/text()')
print(title_s)

3. CSDN热门文章抓取

import os
import urllib.parse
import urllib.request as ur

import lxml.etree as le
import user_agent

# Ask for the search keyword, then the first and last result page (inclusive)
# to crawl; both page numbers are parsed as integers.
keyword = input('请输入关键词:')
pn_start, pn_end = int(input('起始页:')), int(input('终止页:'))

def getRequest(url):
    """Return a urllib Request for *url* with a User-Agent header attached.

    The header value comes from user_agent.get_user_agent_pc().
    """
    ua_headers = {'User-Agent': user_agent.get_user_agent_pc()}
    return ur.Request(url=url, headers=ua_headers)
def getProxyOpener(proxy_address=None):
    """Create an opener that sends requests through a proxy IP.

    Args:
        proxy_address: Optional "ip:port" string. When omitted, a fresh
            address is fetched from the proxy API over the network.

    Returns:
        A urllib opener whose http and https requests go through the proxy.
    """
    if proxy_address is None:
        # strip() removes the surrounding whitespace/newline in the API reply.
        proxy_address = ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4').read().decode('utf-8').strip()
    # ProxyHandler only proxies the schemes present in its mapping. The crawler
    # requests https pages, so 'https' must be listed as well; with only
    # 'http' those requests would bypass the proxy.
    proxy_handler = ur.ProxyHandler(
        {
            'http': proxy_address,
            'https': proxy_address,
        }
    )
    return ur.build_opener(proxy_handler)


# open() does not create missing directories, so make sure the output
# folder exists before any article is saved.
os.makedirs('blog', exist_ok=True)

# Characters that cannot be used in file names on common file systems are
# replaced with '_' when an article title is turned into a file name.
unsafe_filename_chars = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

for pn in range(pn_start, pn_end + 1):
    # Percent-encode the keyword: a raw non-ASCII keyword in the URL makes the
    # HTTP client raise UnicodeEncodeError when the request is sent.
    request = getRequest(
        'https://so.csdn.net/so/search/s.do?p=%s&q=%s&t=blog&domain=&o=&s=&u=&l=&f=&rbg=0'
        % (pn, urllib.parse.quote(keyword))
    )
    try:
        response = getProxyOpener().open(request).read()
        href_s = le.HTML(response).xpath('//span[@class="down fr"]/../span[@class="link"]/a/@href')
    except Exception as e:
        # Best effort: report the failed result page and move on to the next
        # one instead of hiding the error.
        print(e)
        continue

    for href in href_s:
        try:
            response_blog = getProxyOpener().open(
                getRequest(href)
            ).read()
            # Extract the article title with XPath.
            title_s = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')
            if not title_s:
                print('no title found: %s' % href)
                continue
            title = title_s[0]
            print(title)
            # Save the downloaded page under blog/ as "<title>.html".
            safe_title = title.strip().translate(unsafe_filename_chars)
            with open('blog/%s.html' % safe_title, 'wb') as f:
                f.write(response_blog)
        except Exception as e:
            print(e)  # report the error and continue with the next article

帆_5021

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python基础（5） —— python爬虫实战

1. 反爬策略之代理IP：首先需要购买代理IP。 import urllib.request as ur; proxy_address = ur.urlopen('http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4').read().decode('utf-8').str...
复制链接

扫一扫

专栏目录