python爬虫

Sermiscry

已于 2023-03-08 18:36:07 修改

阅读量124

点赞数

分类专栏： python 文章标签： python

于 2023-03-03 10:53:32 首次发布

本文链接：https://blog.csdn.net/Sermisry/article/details/129315841

版权

python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

文章目录

re模块（[正则表达式](https://blog.csdn.net/Sermisry/article/details/129269111?spm=1001.2014.3001.5501)）
- 图片
- movie
bs4
xpath

re模块（正则表达式）

import re

# findall: return every non-overlapping match in the string as a list of strings.
# The patterns are raw strings so that "\d" reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
lst = re.findall(r"\d+", "我的电话号是：10086，我女朋友的电话号是：10010")
# print(lst)

# finditer: finds the same matches but returns an iterator of match objects;
# call .group() on each item to get the matched text.
it = re.finditer(r"\d+", "我的电话号是：10086，我女朋友的电话号是：10010")
# for i in it:
#     print(i.group())

# search: stops at the first match and returns it as a match object
# (None when nothing matches); call .group() to get the text.
s = re.search(r"\d+", "我的电话号是：10086，我女朋友的电话号是：10010")
# print(s.group())  # prints "10086"

# match: only matches at the very beginning of the string, so here it
# returns None because the string does not start with a digit.
s = re.match(r"\d+", "我的电话号是：10086，我女朋友的电话号是：10010")
# print(s.group())  # would raise AttributeError: s is None, nothing matched

# Pre-compile the regular expression so it can be reused.
obj = re.compile(r"\d+")

ret = obj.finditer("我的电话号是：10086，我女朋友的电话号是：10010")
# print(ret)
# for it in ret:
#     print(it.group())

# (?P<group_name>pattern) names a group so that this part of the match can be
# extracted on its own with .group("group_name").

图片

# 如何爬取图片数据
import requests

if __name__ == '__main__':
    url = 'https://www.liulangla.cn/public/uploads/images/20230227/3624_20230227121446b8185.jpeg'

    # A response body can be read as .text (str), .content (bytes) or
    # .json() (parsed object); an image has to be taken as raw bytes.
    # The timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url=url, timeout=10)
    # Fail loudly on a 4xx/5xx answer instead of saving an error page as dog.jpg.
    response.raise_for_status()
    img_data = response.content

    with open('./dog.jpg', 'wb') as fp:
        fp.write(img_data)

movie

import requests
import re

domain = "https://www.dytt89.com/"
# If the site's certificate is rejected, verify=False can be passed here to
# skip certificate verification. The timeout avoids hanging on a dead server.
response = requests.get(domain, timeout=10)
response.encoding = 'gb2312'  # the site is not UTF-8; set the charset explicitly
# print(response.text)

# obj1: the <ul> that follows the "2023必看热片" heading on the home page
# obj2: the href of every link inside that <ul>
# obj3: the movie title and download link on a detail page
obj1 = re.compile(r'2023必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片　　名(?P<movie>.*?)<br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">', re.S)

result1 = obj1.finditer(response.text)
child_href_list = []
for it in result1:
    ul = it.group('ul')
    # print(ul)
    # Extract the detail-page links from the list
    result2 = obj2.finditer(ul)
    for itt in result2:
        # Build the absolute URL: domain already ends with "/", so the
        # slashes around the relative href are stripped before joining
        child_href = domain + itt.group('href').strip('/')
        # print(itt.group('href'))
        child_href_list.append(child_href)  # keep the detail-page link for later

# Fetch every detail page and pull out the title and the download link
for href in child_href_list:
    child_res = requests.get(href, timeout=10)
    child_res.encoding = 'gb2312'
    # print(child_res.text)
    result3 = obj3.search(child_res.text)
    if result3 is None:
        # search() returns None when the page does not have the expected
        # layout; report it and move on instead of crashing on .group()
        print("no movie info found:", href)
        continue
    print(result3.group("movie"))
    print(result3.group("download"))

bs4

拿到页面源代码，提取子页面的链接地址 href
通过href拿到子页面的内容，从子页面中找到图片的下载地址 img->src
下载图片

import requests
from bs4 import BeautifulSoup
import time

url = 'https://www.umei.cc/bizhitupian/weimeibizhi/'
response = requests.get(url, timeout=10)  # timeout: do not hang on a stalled server
response.encoding = 'utf-8'
# print(response.text)
# Hand the page source to BeautifulSoup to get a searchable document object
main_page = BeautifulSoup(response.text, 'html.parser')
alist = main_page.find('div', class_="item_list infinite_scroll").find_all('a')
# print(alist)
for a in alist:
    link = a.get('href')  # .get() reads an attribute value; None when it is absent
    print(link)
    if not link:
        # An <a> without href cannot be followed
        continue
    href = 'https://www.umei.cc' + link
    # Fetch the source of the detail page
    child_page_resp = requests.get(url=href, timeout=10)
    child_page_resp.encoding = 'utf-8'
    child_page_text = child_page_resp.text
    # Locate the image download URL on the detail page (div.big-pic > img[src]).
    # find() returns None when the element is missing, so each step is guarded.
    child_page = BeautifulSoup(child_page_text, "html.parser")
    p = child_page.find("div", class_="big-pic")
    img = p.find("img") if p is not None else None
    src = img.get("src") if img is not None else None
    if src:
        # The with-statement closes the file itself; no explicit close() needed
        with open('高清壁纸.txt', 'a', encoding='utf-8') as f:
            f.write(src + '\n')
        print(src)
    time.sleep(1)  # pause between requests to go easy on the server

    # print(src)

    # Downloading the image itself (sketch, left disabled):
    # img_response = requests.get(src, timeout=10)
    # img_name = src.split("/")[-1]  # everything after the last "/" in the URL
    # with open("img/" + img_name, mode='wb') as f:
    #     f.write(img_response.content)  # .content is the raw image bytes
    # print("over", img_name)
    # time.sleep(1)

xpath

xpath是在XML文档中搜索内容的一门语言
html和xml结构相似，都是由标签构成的树形文档（严格来说html并不是xml的子集，XHTML才是），因此也可以用xpath在html中查找内容
安装lxml模块 pip install lxml
xpath解析

from lxml import etree

# NOTE: `xml` must hold the XML document text; it is not defined in this snippet.
tree1 = etree.XML(xml)
tree1.xpath('/book/name/text()')  # "/" separates levels, the leading "/" is the root; text() selects the text
tree1.xpath('/book/name/auther//nick')  # "//" selects descendants at any depth
tree1.xpath('/book/name/auther/*/nick')  # "*" is a wildcard matching any node

# NOTE(review): etree.parse uses the XML parser by default; for HTML that is
# not well-formed XML, pass etree.HTMLParser() as the second argument.
tree2 = etree.parse('b.html')
tree2.xpath('/html/body/ul/li[1]/a/text()')  # text of the <a> inside the first <li>
tree2.xpath("/html/body/ol/li/a[@href='dapao']/text()")  # text of the <a> whose href is "dapao"

ol_list_li = tree2.xpath("/html/body/ol/li")  # every <li> inside the <ol>
for li in ol_list_li:
    print(li)
    # Query relative to the current <li>: "./" starts from this node
    result1 = li.xpath("./a/text()")  # text of the <a> inside this <li>

    result2 = li.xpath("./a/@href")  # value of the href attribute of that <a>

tree2.xpath("html/body/ul/li/a/@href")
# Further example XPath expressions (they look like paths copied from a
# browser's developer tools — TODO confirm); kept as comments because they
# are not Python statements:
#   //*[@id="6"]/div/div[2]/a[5]
#   /html/body/div[2]/div[4]/div[1]/div[3]/div[6]/div/div[2]/a[5]