1. Scraping Data with urllib

Basic steps:

  1. Send a request

  2. Extract the information

  3. Save the information

I. Sending Requests

Import the package and send a basic request:

# Import the package
import urllib.request
# Send the request: simulates a browser sending a request to the server
response = urllib.request.urlopen(url)
# Read the response data; remember to decode with .decode('utf-8')
read()        # read the body as bytes; read(5) returns only the first 5 bytes
readline()    # read one line
readlines()   # read line by line until the end
getcode()     # get the HTTP status code
geturl()      # get the URL
getheaders()  # get the response headers
# Download the requested file directly
urllib.request.urlretrieve()
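
Putting it together, a minimal sketch (using http://www.baidu.com as the target, as in the GET example below):

import urllib.request

url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)        # send the request
print(response.getcode())                     # 200 on success
content = response.read().decode('utf-8')    # decode the bytes into a str
# urlretrieve saves a URL's content straight to a local file
urllib.request.urlretrieve(url, 'baidu.html')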

1. Customizing GET requests

1) Chinese parameter values must be encoded with urllib.parse.urlencode(data)

2) The encoded query string must be concatenated onto the base URL

# Import the package
import urllib.parse
# Build the URL
base_url = 'http://www.baidu.com/s?'
data = {
    'name': '小刚',
    'sex': '男',
}
data = urllib.parse.urlencode(data)  # percent-encodes the Chinese values and joins the pairs with &
url = base_url + data
# Customize the headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
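
To actually send the customized GET, wrap the URL and headers in a Request object, the same pattern the Douban example below uses; a short sketch continuing from the code above:

import urllib.request

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')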

Example: scraping Douban data

import urllib.request
import urllib.parse

# Build the request object
def create_request(page):
    base_url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&"
    data = {
        "start": (page - 1) * 20,
        "limit": 20
    }
    url = base_url + urllib.parse.urlencode(data)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

# Send the request
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    return content

# Save the content to a file
def down_load(page: int, content: str):
    with open("data/douban_" + str(page) + ".json", "w", encoding="utf-8") as file:
        file.write(content)


# Program entry point
if __name__ == "__main__":
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)

2. Customizing POST requests

1) Chinese form values must be encoded with urllib.parse.urlencode(data)

2) After urlencoding, the form must also be converted to bytes with .encode('utf-8')

# Import the package
import urllib.parse
# Set the URL
url = 'https://fanyi.baidu.com/sug'
# Customize the headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
keyword = input('Enter the word to look up: ')
# Build the form data
data = {
    'kw': keyword
}
data = urllib.parse.urlencode(data).encode('utf-8')  # encode the form, then convert it to bytes
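
Passing the encoded bytes via the data parameter is what makes urllib issue a POST; a sketch completing the request above (the sug endpoint responds with JSON):

import json
import urllib.request

request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(json.loads(content))  # e.g. suggestions for the keyword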

Example: scraping KFC store data

import urllib.request
import urllib.parse

# Build the request object
def create_request(page):
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
    data = {
        'cname': '北京',   # city name (Beijing)
        'pid': '',         # empty; a Python None would be sent literally as the string "None"
        'pageIndex': page,
        'pageSize': 10
    }
    data = urllib.parse.urlencode(data).encode("utf-8")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
    }
    request = urllib.request.Request(url=url, data=data, headers=headers)
    return request

# Send the request
def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    return content

# Save the content to a file
def down_load(page: int, content: str):
    with open("data/kfc_" + str(page) + ".json", "w", encoding="utf-8") as file:
        file.write(content)


# Program entry point
if __name__ == "__main__":
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)

3. Handler requests (proxies)

Purpose: enables dynamic cookies and IP proxies.

# 1. Create an HTTPHandler object
handler = urllib.request.HTTPHandler()
# 2. Build an opener from the handler
opener = urllib.request.build_opener(handler)
# 3. Open the request through the opener
response = opener.open(request)
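
The fragment above assumes an existing request; a complete round trip looks like this (a sketch, using http://www.baidu.com as a placeholder):

import urllib.request

request = urllib.request.Request('http://www.baidu.com')
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')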

IP proxy:

def get_content_from_proxy(request):
    proxies = {
        'http': '27.42.168.46:55481'
    }
    # 1. Create a ProxyHandler with the proxy mapping
    handler = urllib.request.ProxyHandler(proxies=proxies)
    # 2. Build an opener from the handler
    opener = urllib.request.build_opener(handler)
    # 3. Open the request through the opener
    response = opener.open(request)
    content = response.read().decode("utf-8")
    return content
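
A common extension is a small proxy pool with one proxy picked at random per request; a sketch (both addresses are placeholders, not known working proxies):

import random
import urllib.request

proxy_pool = [
    {'http': '27.42.168.46:55481'},  # placeholder proxies; substitute live ones
    {'http': '60.184.110.80:9999'},
]

def get_content_from_proxy_pool(request):
    # pick a random proxy so repeated requests come from different IPs
    handler = urllib.request.ProxyHandler(proxies=random.choice(proxy_pool))
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)
    return response.read().decode('utf-8')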

II. Parsing Pages

1. XPath: extracting from XML/HTML files

Install an XPath plugin in the browser (e.g. XPath Helper).

Install the lxml package (e.g. through PyCharm).

1) Parsing a local file

# Import the package
from lxml import etree

# Parse the file
tree = etree.parse("data/ip.html")
# Extract the information
li_list = tree.xpath('//li/text()')

Extraction syntax:

Basic XPath syntax:
    1. Path queries
    // : select all descendants, regardless of depth
    /  : select direct children
    2. Predicate queries
    //div[@id]
    //div[@id="maincontent"]
    3. Attribute queries
    //@class
    4. Fuzzy queries
    //div[contains(@id, "he")]
    //div[starts-with(@id, "he")]
    5. Content queries
    //div/h1/text()
    6. Logical operators
    //div[@id="head" and @class="s_down"]
    //title | //price
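
A self-contained sketch exercising these selectors (the inline HTML is made up for illustration):

from lxml import etree

html = '''
<div id="head" class="s_down"><h1>Title</h1></div>
<div id="maincontent"><ul><li class="item">a</li><li>b</li></ul></div>
'''
tree = etree.HTML(html)
print(tree.xpath('//li/text()'))                          # path query: ['a', 'b']
print(tree.xpath('//div[@id="maincontent"]//li/text()'))  # predicate query
print(tree.xpath('//li/@class'))                          # attribute query: ['item']
print(tree.xpath('//div[starts-with(@id, "ma")]/@id'))    # fuzzy query: ['maincontent']
print(tree.xpath('//div[@id="head" and @class="s_down"]/h1/text()'))  # logic: ['Title']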

2) Parsing a fetched page

# Fetch the page content
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
# Parse the page
tree = etree.HTML(content)
# Extract the information
data_list = tree.xpath('//input[@id="su"]/@value')

Example: scraping image data

import urllib.request
from lxml import etree
from tqdm import tqdm
"""
First page:
https://sc.chinaz.com/tupian/qinglvtupian.html

Second page:
https://sc.chinaz.com/tupian/qinglvtupian_2.html
"""


def create_request(page):
    if page == 1:
        url = "https://sc.chinaz.com/tupian/qinglvtupian.html"
    else:
        url = "https://sc.chinaz.com/tupian/qinglvtupian_" + str(page) + ".html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request


def get_content(request):
    req = urllib.request.urlopen(request)
    content = req.read().decode("utf-8")
    return content


def down_load(content):
    tree = etree.HTML(content)
    link_list = tree.xpath('//div[@class="tupian-list com-img-txt-list"]//img/@data-original')
    name_list = tree.xpath('//div[@class="tupian-list com-img-txt-list"]//img/@alt')
    for i in tqdm(range(len(link_list)), desc="Downloading images"):
        link = "https:" + link_list[i]
        name = name_list[i]
        urllib.request.urlretrieve(link, filename="./photo/" + name + ".jpg")


if __name__ == "__main__":
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(start_page, end_page + 1):
        # Build the request object
        request = create_request(page)
        # Fetch the page
        content = get_content(request)
        # Download the images
        down_load(content)

2. JSONPath: extracting from JSON files

Usage:

# Using jsonpath:
obj = json.load(open('file.json', 'r', encoding='utf-8'))
ret = jsonpath.jsonpath(obj, 'jsonpath expression')

Matching syntax:
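
The operators mirror XPath: $ is the root object, @ the current object, . or [] a child, .. recursive descent, * a wildcard, [start:end] a slice, and ?() a filter. A small sketch (the inline data is made up for illustration):

import json
import jsonpath

obj = json.loads('{"store": {"book": [{"title": "A", "price": 8}, {"title": "B", "price": 24}]}}')
print(jsonpath.jsonpath(obj, '$..title'))                # recursive descent: ['A', 'B']
print(jsonpath.jsonpath(obj, '$.store.book[0].title'))   # child + subscript: ['A']
print(jsonpath.jsonpath(obj, '$..book[?(@.price<10)]'))  # filter on the current object
# jsonpath.jsonpath returns False when nothing matches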

Example: reading Taopiaopiao city data

import json
import urllib.request

import jsonpath

url = "https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1665842537566_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true"
headers = {
    # HTTP/2 pseudo-headers (starting with ':') cannot be sent by urllib and stay commented out
    # ':authority': 'dianying.taobao.com',
    # ':method': 'GET',
    # ':path': '/cityAction.json?activityId&_ksTS=1665842537566_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true',
    # ':scheme': 'https',
    'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    # 'accept-encoding': 'gzip, deflate, br',  # keep commented out: urllib does not decompress gzip
    'accept-language': 'zh-CN,zh;q=0.9',
    'bx-v': '2.2.3',
    'cookie': 't=01e264dc462c7ec31fa81c964480ef71; cna=PQQNG2ID00wCAcom+Kx+6VFW; sgcookie=E100NV67fuPefRFxmJ0rvIhweAZoab6bysNUqOllPpeLt7x9bpzqS%2BymW2%2Bx44dTkXaSrvn9kbzwh6Zx%2BIsDmj8dYMBa5bqwE2Skvf1Cy6Xrvyysien1uEZxct1eOkjuavSS; tracknick=%5Cu674E%5Cu4E8C%5Cu5E06%5Cu5475%5Cu5475%5Cu5475; _cc_=VFC%2FuZ9ajQ%3D%3D; cookie2=1dfb433f9bd17bd74b43a203c5d6e815; v=0; _tb_token_=577eb3bb87f08; xlly_s=1; tb_city=110100; tb_cityName="sbG+qQ=="; tfstk=cfK1BVYee5V61Om0j1ME_WX9wgscZBw5hV1MCX7g6-W3mBJ1iirPN2i9oSIVH91..; l=eBQNq69gL8OhvXN6BO5Cnurza7792QRb4sPzaNbMiInca6iATFaNYNCU_UxJ7dtjgtCAuetzv3EoxdLHR3AgCc0c07kqm0SZUxvO.; isg=BGVlVF3rai5ub480JRB2COKOdCGfohk0ct6phGdIkhyrfoXwL_CvBHoQCOII_jHs',
    'referer': 'https://dianying.taobao.com/',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
# Build the request
req = urllib.request.Request(url=url, headers=headers)
# Fetch the data
response = urllib.request.urlopen(req)
content = response.read().decode("utf-8")
content = content.split("(")[1].split(")")[0]  # strip the jsonp109(...) wrapper
# Extract the information
json_data = json.loads(content)
data_list = jsonpath.jsonpath(json_data, "$..regionName")
for i in data_list:
    print(i)

3. BeautifulSoup: parsing HTML/XML

Basic usage:

# 1. Import the package
from bs4 import BeautifulSoup

# 2. Create the object
# From a server response
soup = BeautifulSoup(response.read().decode(), 'lxml')
# From a local file
soup = BeautifulSoup(open('1.html', encoding='utf-8'), 'lxml')
# Note: open() may default to gbk (e.g. on Chinese Windows), so specify the encoding

# 3. Locating nodes: three methods
# Returns the first match; keyword arguments narrow the search
soup.find("img", id="", class_="")
# Returns all matches; limit caps the number of results
soup.find_all(["img", "a"], limit=2)
# Returns all matches for a CSS selector
soup.select()
   # 1. element
       # e.g. p
   # 2. .class
       # e.g. .firstname
   # 3. #id
       # e.g. #firstname
   # 4. attribute selectors
       # [attribute]
       # e.g. li = soup.select('li[class]')
       # [attribute=value]
       # e.g. li = soup.select('li[class="hengheng1"]')
   # 5. hierarchy selectors
       # element element    descendant
       # div p
       # element > element  direct child
       # div > p
       # element,element    union
       # div,p
       # e.g. soup = soup.select('a,span')

# 4. Getting node content
   # (1) Text content (for tags that nest other tags)
       # obj.string      returns None if the tag contains nested tags
       # obj.get_text()  works even with nested tags [recommended]
   # (2) Node properties
       # tag.name   the tag name
           # e.g. tag = find('li')
           # print(tag.name)
       # tag.attrs  the attributes as a dict
   # (3) Getting an attribute value
       # obj.attrs.get('title')  [common]
       # obj.get('title')
       # obj['title']
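
A self-contained sketch of these calls (the inline HTML is made up for illustration):

from bs4 import BeautifulSoup

html = '''
<ul>
  <li class="hengheng1" title="first"><a href="/a">one</a></li>
  <li>two</li>
</ul>
'''
soup = BeautifulSoup(html, 'lxml')
first = soup.find('li', class_='hengheng1')
print(first.get_text().strip())           # one (works despite the nested <a>)
print(first.attrs.get('title'))           # first
print(len(soup.find_all('li', limit=2)))  # 2
print(soup.select('ul > li.hengheng1'))   # CSS child + class selector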

Example: scraping the Starbucks product list

from bs4 import BeautifulSoup
import urllib.request

url = "https://www.starbucks.com.cn/menu/"
response = urllib.request.urlopen(url)
content = response.read().decode("utf-8")

soup = BeautifulSoup(content, 'lxml')
# XPath equivalent: //ul[@class="grid padded-3 product"]//strong
name_list = soup.select("ul[class='grid padded-3 product'] strong")
for e in name_list:
    print(e.get_text())

Finally: Countering Anti-Scraping Measures

1. Request headers

1) User-Agent detection

Add a User-Agent to the request headers:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}

2) Cookie detection

Send a valid cookie with the request headers:

'cookie': 'REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH..'

3) Referer detection

The server checks whether the request arrived from the expected previous page, e.g. hotlink protection for images.
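
Countered by sending a Referer header matching the page the resource is normally embedded in; a sketch (the URLs are placeholders):

import urllib.request

headers = {
    'Referer': 'https://example.com/gallery',  # pretend the request came from the gallery page
}
request = urllib.request.Request('https://example.com/img/1.jpg', headers=headers)
response = urllib.request.urlopen(request)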

2. IP detection

Use a proxy Handler to build the request (see the ProxyHandler section above).

3. Hidden form fields

These usually sit in the page source: fetch the source first, then extract the hidden field values.
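
A sketch of pulling every hidden field out of a fetched form page before building the POST data (the URL is a placeholder):

import urllib.request
from lxml import etree

content = urllib.request.urlopen('https://example.com/login').read().decode('utf-8')
tree = etree.HTML(content)
# collect the name/value pairs of all hidden inputs into the form data
hidden_inputs = tree.xpath('//input[@type="hidden"]')
form = {inp.get('name'): inp.get('value', '') for inp in hidden_inputs}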

4. Captchas

Use a session to keep the conversation alive.

Recognize the captcha manually, or with a paid third-party service.
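
With plain urllib, keeping the session means carrying cookies across requests through a CookieJar; a sketch combining that with manual captcha input (the URLs are placeholders):

import http.cookiejar
import urllib.request

jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))

# the captcha image and the login form must be fetched within the same cookie session
with open('captcha.jpg', 'wb') as f:
    f.write(opener.open('https://example.com/captcha.jpg').read())
code = input('Enter the captcha shown in captcha.jpg: ')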
