简单爬取网页数据

赫子菌

已于 2024-07-17 23:46:06 修改

阅读量108

点赞数 4

分类专栏： python 文章标签： python 网络爬虫

于 2024-07-16 22:21:33 首次发布

本文链接：https://blog.csdn.net/qq_63627848/article/details/140477910

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

首先定义一个获取网页文本的函数

import requests
def get_text(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The decoded page text, or an empty string if the request failed
        (connection error, timeout, invalid URL, or a non-2xx status code).
    """
    # Send a browser-like User-Agent instead of the default one from requests.
    headers = {
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"
    }
    # Log which address is being accessed.
    print(f"正在访问{url}...")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Raise for any status code that is not 2xx.
        response.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are treated as "page unavailable";
        # KeyboardInterrupt and programming errors are left to propagate.
        print(f"访问{url}错误!")
        return ""
    # Decode with the encoding detected from the body rather than relying
    # solely on the encoding declared in the response headers.
    response.encoding = response.apparent_encoding
    return response.text

定义一个函数获取网页的bytes数据

def get_content(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the raw response body as bytes.

    Args:
        url: Address of the resource to download (for example an image).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The undecoded response body, or ``b""`` if the request failed
        (connection error, timeout, invalid URL, or a non-2xx status code).
    """
    # Send a browser-like User-Agent instead of the default one from requests.
    headers = {
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0"
    }
    # Log which address is being accessed.
    print(f"正在访问{url}...")
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Raise for any status code that is not 2xx.
        response.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are treated as "resource unavailable";
        # KeyboardInterrupt and programming errors are left to propagate.
        print(f"访问{url}错误!")
        # Return empty binary data so callers can test the result for truth.
        return b""
    # No encoding detection here: response.content is the raw byte payload
    # and is not affected by response.encoding.
    return response.content

1.利用BeautifulSoup库获取对应信息

基本步骤

#BeautifulSoup => bs =>将html解析成BS对象

import csv
import os
import re

import bs4
import requests

import download
from download import get_text

# Scrape the first page of the Douban book Top 250 list with BeautifulSoup:
# write one CSV row per book (title, publication info, rating) and save each
# book's cover image into img_path.

# Create the folder that receives the cover images.
img_path = "./douban_top_250_imgs"
os.makedirs(img_path, exist_ok=True)

# Address of the page to scrape.
base_url = "https://book.douban.com/top250?start=0"

# The with-statement closes the CSV file even if parsing or a download raises.
# newline="" is what the csv module expects for files it writes to.
with open("douban_top_250.csv", "w", encoding="utf-8", newline="") as csv_file:
    # csv.writer quotes fields that contain commas, quotes or line breaks,
    # so such characters in a title cannot shift the columns.
    writer = csv.writer(csv_file)

    # get_text returns an empty string when the page could not be fetched.
    text = get_text(base_url)

    if text:
        # BeautifulSoup => bs => parse the HTML text into a BS object.
        soup = bs4.BeautifulSoup(text, "lxml")
        # Find the div whose id is "content", then every table inside it.
        # If the div is missing, treat the page as having no books instead
        # of failing on None.
        content_div = soup.find("div", attrs={"id": "content"})
        b_list = content_div.find_all("table") if content_div is not None else []
        print(b_list, len(b_list))

        for book in b_list:
            # Book title.
            book_title = book.find("div", attrs={"class": "pl2"}).a.get("title")
            # Publication details of the book.
            b_info = book.find("p", attrs={"class": "pl"}).get_text()
            # Rating.
            book_rate = book.find("span", attrs={"class": "rating_nums"}).get_text()
            # URL of the cover image.
            img_url = book.find("img").get("src")

            # Store the textual data.
            writer.writerow([book_title, b_info, book_rate])

            # get_content returns b"" on failure; skip the file in that case
            # rather than leaving an empty .jpg behind.
            image_bytes = download.get_content(img_url)
            if image_bytes:
                # Replace characters that are not allowed in file names on
                # common file systems so open() does not fail on the title.
                safe_title = re.sub(r'[\\/:*?"<>|]', "_", book_title)
                with open(os.path.join(img_path, f"{safe_title}.jpg"), "wb") as f:
                    f.write(image_bytes)

2.利用lxml库将html解析成Element对象，再对元素进行查找

lxml 中使用 XPath 查找元素的基本格式

# root_ele.xpath('//标签[@属性名="属性值"]')
# root_ele.xpath('//div[@class="mnav"]')
# root_ele.xpath('//div[@class]/li[@class="nav"]/@属性名')
# root_ele.xpath('//div[@class]/li[@class="nav"]/text()')

# /nodename : 从根节点开始选取子节点
# ./nodename : 从当前节点选取子节点
# //nodename :从根节点开始匹配子孙节点
# .//nodename :从当前节点开始匹配子孙节点


import csv
import os
import re

from lxml import html

import download
from download import get_text

# Scrape ten pages of the Douban book Top 250 list with lxml/XPath: write one
# CSV row per book (title, publication info, rating) and save each book's
# cover image into img_path.

# Create the folder that receives the cover images.
img_path = "./douban_top_250_xpath_imgs"
os.makedirs(img_path, exist_ok=True)

# The with-statement closes the CSV file even if parsing or a download raises.
# newline="" is what the csv module expects for files it writes to.
with open("douban_top_250_xpath.csv", "w", encoding="utf-8", newline="") as csv_file:
    # csv.writer quotes fields that contain commas, quotes or line breaks,
    # so such characters in a title cannot shift the columns.
    writer = csv.writer(csv_file)

    # Fetch ten result pages; the start parameter advances by 25 per page.
    for i in range(10):
        base_url = f"https://book.douban.com/top250?start={i*25}"
        # get_text returns an empty string when the page could not be fetched.
        text = get_text(base_url)
        if not text:
            continue

        root = html.fromstring(text)
        # Every <table> in the document is treated as a candidate book entry.
        for book in root.xpath("//table"):
            title_nodes = book.xpath(".//td[2]/div[@class='pl2']/a/text()")
            subtitle_nodes = book.xpath(".//td[2]/div[@class='pl2']/a/span/text()")
            info_nodes = book.xpath(".//td[2]/p[@class='pl']/text()")
            rating_nodes = book.xpath(
                ".//td[2]/div[@class='star clearfix']/span[@class='rating_nums']/text()"
            )
            cover_nodes = book.xpath(".//img[@width='90']/@src")

            # "//table" can also match tables that are not book entries;
            # skip anything lacking a required field instead of failing with
            # IndexError on the [0] lookups below.
            if not (title_nodes and info_nodes and rating_nodes and cover_nodes):
                continue

            # Split the title on whitespace and join it back together to
            # remove the spaces and line breaks around it.
            book_title1 = "".join(title_nodes[0].split())
            book_title2 = "".join(subtitle_nodes)
            book_title = f"{book_title1}{book_title2}".replace(":", "：")
            book_info = info_nodes[0]
            book_rating = rating_nodes[0]
            book_cover_url = cover_nodes[0]

            # get_content returns b"" on failure; skip the file in that case
            # rather than leaving an empty image behind.
            cover_bytes = download.get_content(book_cover_url)
            if cover_bytes:
                # Replace characters that are not allowed in file names on
                # common file systems so open() does not fail on the title.
                safe_title = re.sub(r'[\\/:*?"<>|]', "_", book_title)
                # Add an extension so the file is recognised as an image.
                # NOTE(review): .jpg is assumed -- confirm against the cover URLs.
                path = os.path.join(img_path, f"{safe_title}.jpg")
                # Write the cover image.
                with open(path, "wb") as f:
                    f.write(cover_bytes)

            # Save the basic data to the CSV file.
            writer.writerow([book_title, book_info, book_rating])