python爬虫学习笔记

最新推荐文章于 2024-09-11 11:22:05 发布

年末的甲鱼

最新推荐文章于 2024-09-11 11:22:05 发布

阅读量987

点赞数

文章标签： python 爬虫学习

本文链接：https://blog.csdn.net/m0_48679741/article/details/131649209

版权

包的导入及使用

requests请求

requests.get/post(url,params,data,headers,timeout,verify,allow_redirects,cookies)

url：要下载的目标网页的URL
params：字典形式，设置URL后的参数，如：?id=123&name=turtle
data：字典或字符串，一般用于POST方式提交数据
headers：设置User-Agent、Referer等请求头
timeout：超时时间，单位s
verify：True/False，是否进行HTTPS证书验证，默认是，需要自己设置证书地址
allow_redirects：True/False是否让requests做重定向处理，默认是
cookies：附带本地cookies数据

接受 response 响应

r = requests.get/post(url)

# 查看状态码，若200为成功
r.status_code
# 可以查看当前编码，以及变更编码
# （requests会根据Headers推测编码，推测不到则设置为ISO-8859-1可能导致乱码）
r.encoding
# 查看返回的网页内容
r.text
# 查看返回的HTTP的headers
r.headers
# 查看实际访问的URL
r.url
# 以字节方式返回内容，如用于下载图片
r.content
# 服务端要写入本地的cookies数据
r.cookies

创建URL爬取管理器
url_manager.py文件

class UrlManager:
    """
    URL manager for a crawler.

    Keeps two sets: URLs that still have to be crawled (``new_urls``) and
    URLs that have already been handed out (``old_urls``).
    """

    def __init__(self):
        """Create the manager with both URL sets empty."""
        self.old_urls = set()
        self.new_urls = set()

    def add_new_url(self, url):
        """
        Queue a single URL for crawling.

        :param url: URL to queue; None, empty values and URLs already present
            in either set are ignored
        """
        if url is None or len(url) == 0:
            return
        already_known = url in self.new_urls or url in self.old_urls
        if not already_known:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """
        Queue several URLs for crawling.

        :param urls: collection of URLs; None or an empty collection is ignored
        """
        if urls is None or len(urls) == 0:
            return
        # Delegate to add_new_url so every entry gets the same filtering.
        for candidate in urls:
            self.add_new_url(candidate)

    def get_url(self):
        """
        Hand out one pending URL and record it as crawled.

        :return: a URL taken from the pending set, or None when none is left
        """
        if not self.has_new_url():
            return None
        taken = self.new_urls.pop()
        self.old_urls.add(taken)
        return taken

    def has_new_url(self):
        """
        Tell whether any URL is still waiting to be crawled.

        :return: True if the pending set is not empty, otherwise False
        """
        return len(self.new_urls) != 0


if __name__ == "__main__":
    # Manual smoke test: queue two distinct URLs (one of them twice), then
    # take them out one at a time and print the state after every step.
    manager = UrlManager()

    manager.add_new_url("url1")
    manager.add_new_urls(["url1", "url2"])
    print(manager.new_urls, manager.old_urls)
    print(manager.has_new_url())

    for _ in range(2):
        print("#" * 30)
        manager.get_url()
        print(manager.new_urls, manager.old_urls)
        print(manager.has_new_url())

Beautiful Soup 语法

创建BeautifulSoup对象

from bs4 import BeautifulSoup

# 根据HTML网页字符串创建BeautifulSoup对象
soup = BeautifulSoup(
                html_doc,              # HTML文档字符串
                'html.parser',         # HTML解析器
                from_encoding='utf8'  # HTML文档编码
                )

搜索节点（ find_all，find ）

方法：find_all(节点名称, 节点属性, 节点文本) # 返回查找到的所有
      find(节点名称, 节点属性, 节点文本)     # 返回查找到的第一个

# 查找所有标签为a的节点
soup.find_all('a')

# 查找所有标签为a，链接符合/view/123.html形式的节点
soup.find_all('a', href='/view/123.html')

# 查找所有标签为div，class为abc，文本为Python的节点
soup.find_all('div', class_='abc', string='Python')

访问节点信息

得到节点：<a href='1.html'>Python</a>

# 获取查找到的节点的标签名称
node.name        # 得到 a

# 获取查找到的a节点的href属性
node['href']     # 得到 1.html

# 获取查找到的a节点的链接文字
node.get_text()  # 得到 Python

例子：python/web_crawler/bs4_test/test.py

测试

![爬取网站文章实例](爬取网站文章实例.png)

爬取博客中的所有文章

import requests
from bs4 import BeautifulSoup
import re
from web_crawler.utils import url_manager

# Address of the site to crawl.
root_url = "http://www.crazyant.net"

# Only article pages of the form http://www.crazyant.net/<digits>.html are
# followed. The pattern is compiled once, and the dots are escaped so that
# they match a literal "." instead of any character.
article_pattern = re.compile(r'^http://www\.crazyant\.net/\d+\.html$')

# Create the URL manager and seed it with the root page.
urls = url_manager.UrlManager()
urls.add_new_url(root_url)

# The context manager closes the output file even if the crawl aborts.
# UTF-8 is set explicitly so that non-ASCII titles do not depend on the
# platform's default encoding.
with open("craw_all_pages.txt", "w", encoding="utf-8") as fout:
    # Keep going while there are URLs waiting to be crawled.
    while urls.has_new_url():
        curr_url = urls.get_url()
        # Fetch the page with a 3 second timeout; a timeout or connection
        # error skips this URL instead of aborting the whole crawl.
        try:
            r = requests.get(curr_url, timeout=3)
        except requests.RequestException as err:
            print("error,request failed", curr_url, err)
            continue
        # Anything other than status 200 is reported and skipped.
        if r.status_code != 200:
            print("error,return status_code is not 200", curr_url)
            continue

        soup = BeautifulSoup(r.text, "html.parser")
        # A page without a <title> tag yields None instead of raising.
        title = soup.title.string if soup.title is not None else None

        # Record the URL and its title, flushing so progress survives a crash.
        fout.write("%s\t%s\n" % (curr_url, title))
        fout.flush()
        # Show what was written and how many URLs are still pending.
        print("success: %s, %s, %d" % (curr_url, title, len(urls.new_urls)))

        # Queue every link on the page that points to an article.
        for link in soup.find_all("a"):
            href = link.get("href")
            if href is None:
                continue
            if article_pattern.match(href):
                urls.add_new_url(href)

爬取排名前250的电影

# 使用pandas将数据写出到Excel
import pandas as pd
# 使用requests爬取网页
import requests
# 使用BeautifulSoup实现数据解析
from bs4 import BeautifulSoup
import pprint
import openpyxl
import json

# Offsets used as the `start` query parameter of each list page:
# 0, 25, ..., 225.
page_indexes = range(0, 250, 25)


def download_all_html():
    """
    Download the HTML of every list page.

    One request is made per offset in the module-level ``page_indexes``.

    :return: list with the HTML text of each page, in request order
    :raises Exception: when a page answers with a status code other than 200
    """
    # The request headers are identical for every page, so build them once.
    # The User-Agent value is a placeholder to be replaced with a real one.
    headers = {
        'User-Agent': '复制请求头中信息'
    }
    htmls = []
    for idx in page_indexes:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print("craw html:", url)
        # A timeout keeps a stalled connection from hanging the whole run.
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            print(r.status_code)
            raise Exception("error")
        htmls.append(r.text)
    return htmls


# Download the HTML of all list pages.
htmls = download_all_html()


def parse_single_html(html):
    """
    Parse one list page and extract its entries.

    :param html: HTML text of the page to parse
    :return: list of dicts with the keys ``rank``, ``title``,
        ``rating_star``, ``rating_num`` and ``comment``
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Walk down to the ordered list that holds the entries.
    article = soup.find("div", class_="article")
    grid = article.find("ol", class_="grid_view")

    datas = []
    for item in grid.find_all("div", class_="item"):
        rank = item.find("div", class_="pic").find("em").get_text()
        info = item.find("div", class_="info")
        heading = info.find("div", class_="hd")
        title = heading.find("span", class_="title").get_text()

        # The spans inside the "star" div carry the rating data.
        star_block = info.find("div", class_="bd").find("div", class_="star")
        spans = star_block.find_all("span")
        star_class = spans[0]["class"][0]
        rating_num = spans[1].get_text()
        comment_text = spans[3].get_text()

        record = {
            "rank": rank,
            "title": title,
            # Strip the "rating" prefix and "-t" suffix from the CSS class.
            "rating_star": star_class.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            # Drop the trailing "人评价" label from the comment count text.
            "comment": comment_text.replace("人评价", "")
        }
        datas.append(record)
    return datas


# Debug aid: pprint.pprint(parse_single_html(htmls[0])) shows the records
# parsed from the first page only.

# Parse every downloaded page and collect all records in page order.
all_datas = [record for page in htmls for record in parse_single_html(page)]
# Debug aid: print(all_datas) / print(len(all_datas)) to inspect the result.

# Write the collected records to an Excel file.
df = pd.DataFrame(all_datas)
df.to_excel("film_crawler.xlsx")

爬取广州10年天气数据

# headers中设置User-Agent绕开反爬机制
# 通过network抓包，分析ajax的请求和参数
# 通过for循环请求不同的参数数据
# 利用pandas实现excel的合并与保存

import requests
import pandas as pd

# URL the weather history is requested from.
url = "http://tianqi.2345.com/Pc/GetHistory"

# Request headers, set to get past the site's anti-crawler check. The
# User-Agent value is a placeholder to be replaced with a real one.
headers = {
    "User-Agent": """复制请求头中信息"""
}


def craw_table(year, month):
    """
    Fetch the weather table for one year and month.

    :param year: year to request
    :param month: month to request
    :return: DataFrame built from the first HTML table in the response, or
        None when the server does not answer with status 200
    """
    from io import StringIO

    # Query-string parameters sent with the request.
    params = {
        "areaInfo[areaId]": 59287,
        "areaInfo[areaType]": 2,
        "date[year]": year,
        "date[month]": month
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    resp = requests.get(url, headers=headers, params=params, timeout=10)

    if resp.status_code != 200:
        # Report the failure instead of dropping the month without a trace.
        print("error,return status_code is not 200", resp.status_code, year, month)
        return None

    # The response is JSON; its "data" field holds the table as HTML text.
    data = resp.json()["data"]
    # Passing literal HTML to read_html is deprecated in recent pandas, so
    # the string is wrapped in a file-like object.
    return pd.read_html(StringIO(data))[0]


# Monthly tables collected so far.
df_list = []
# range(2012, 2023) covers the years 2012 through 2022 inclusive.
for year in range(2012, 2023):
    for month in range(1, 13):
        print("爬取：", year, month)
        df = craw_table(year, month)
        # craw_table returns None for a failed request; keep only real tables
        # so the merge below never receives None entries.
        if df is None:
            print("skip,no data:", year, month)
            continue
        df_list.append(df)

# Merge all monthly tables and write them to one Excel file. pd.concat
# raises on an empty list, so that case is reported instead.
if df_list:
    pd.concat(df_list).to_excel("GuangZhouTemperature_10year.xlsx", index=False)
else:
    print("error,no data crawled, nothing written")

爬取全本小说

import requests
from bs4 import BeautifulSoup
import time

# URL of the page that lists the chapters.
root_url = "https://www.kunnu.com/santi/"
# Request headers, set to get past the site's anti-crawler check. The header
# name must not be empty; the value is a placeholder to be replaced with a
# real User-Agent string.
headers = {
    "User-Agent": "复制请求头中信息"
}


def get_novel_chapters():
    """
    Collect the link and name of every chapter listed on the root page.

    :return: list of ``(href, chapter name)`` tuples; empty when the page
        does not answer with status 200 or contains no chapter list
    """
    r = requests.get(root_url, headers=headers)
    if r.status_code != 200:
        print(r.status_code)
        # An empty list lets callers iterate the result unconditionally.
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    # The chapter links live in the div with class "book-list clearfix".
    chapter = soup.find("div", class_="book-list clearfix")
    if chapter is None:
        return []

    data = []
    for li in chapter.find_all("li"):
        link = li.find("a")
        # List items without a hyperlink are skipped.
        if not link:
            continue
        # The href is stored exactly as found in the page.
        data.append((link["href"], link.get_text()))
    return data


def get_chapter_content(url):
    """
    Download the text of one chapter.

    :param url: URL of the chapter page
    :return: text of the div with id "nr1", or an empty string when the page
        has no such div
    """
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    content = soup.find("div", id="nr1")
    # find() returns None when the div is missing; calling get_text() on it
    # would raise AttributeError.
    if content is None:
        return ""
    return content.get_text()


# Crawl the chapters one by one. "or []" covers a None result from
# get_novel_chapters, which it returns when the root page cannot be fetched.
for url, title in get_novel_chapters() or []:
    print(url, title)
    # Download first so a failed request does not leave a title in the file
    # with no text under it.
    content = get_chapter_content(url)
    # Mode "a" appends to the file, whereas "w" would overwrite it. UTF-8 is
    # explicit so the Chinese text does not depend on the platform default.
    with open("三体1.txt", "a", encoding="utf-8") as f:
        f.write(title + '\n')
        f.write(content)

修改

import requests
from bs4 import BeautifulSoup
import time

# URL of the page that lists the chapters.
root_url = "https://www.kunnu.com/santi/"
# Request headers, set to get past the site's anti-crawler check. The
# User-Agent value is a placeholder to be replaced with a real one.
headers = {
    "User-Agent": "复制请求头中信息"
}


def get_novel_chapters():
    """
    Collect the link and name of every chapter listed on the root page.

    :return: list of ``(href, chapter name)`` tuples; empty when the page
        does not answer with status 200 or contains no chapter list
    """
    r = requests.get(root_url, headers=headers)
    if r.status_code != 200:
        print(r.status_code)
        # An empty list lets callers iterate the result unconditionally.
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    # The page may hold several divs with class "book-list clearfix";
    # all of them are scanned.
    book = soup.find_all("div", class_="book-list clearfix")

    data = []
    for chapter in book:
        for li in chapter.find_all("li"):
            link = li.find("a")
            # List items without a hyperlink are skipped.
            if not link:
                continue
            # The href is stored exactly as found in the page.
            data.append((link["href"], link.get_text()))
    return data


def get_chapter_content(url):
    """
    Download the text of one chapter.

    :param url: URL of the chapter page
    :return: text of the div with id "nr1", or an empty string when the page
        has no such div
    """
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    content = soup.find("div", id="nr1")
    # Return a string in every case: the result is passed straight to
    # file.write(), which rejects None.
    if content is None:
        return ""
    return content.get_text()


# Crawl the chapters one by one. "or []" covers a None result from
# get_novel_chapters, which it returns when the root page cannot be fetched.
for url, title in get_novel_chapters() or []:
    print(url, title)
    # get_chapter_content may return None when the page has no text div;
    # write() needs a string. Downloading before opening the file also avoids
    # leaving a title with no text under it if the request fails.
    content = get_chapter_content(url) or ""
    # Mode "a" appends to the file, whereas "w" would overwrite it. UTF-8 is
    # explicit so the Chinese text does not depend on the platform default.
    with open("三体完结.txt", "a", encoding="utf-8") as f:
        f.write(title + '\n\n')
        f.write(content)
        f.write('\n')

改进爬取过程

# Open the file once and keep it open for all writes. Mode "w" starts from
# an empty file; UTF-8 is explicit so the Chinese text does not depend on the
# platform default.
with open("三体完结.txt", "w", encoding="utf-8") as f:
    # "or []" covers a None result from get_novel_chapters, which it returns
    # when the root page cannot be fetched.
    for url, title in get_novel_chapters() or []:
        print(url, title)
        f.write(title + '\n\n')
        # get_chapter_content may return None when the page has no text div;
        # write() needs a string.
        f.write(get_chapter_content(url) or "")
        f.write('\n')

爬取图片

import requests
from bs4 import BeautifulSoup
import os

# URL of the page whose images are downloaded.
url = "https://pic.netbian.com/e/search/result/?searchid=343"

resp = requests.get(url)
# Decode the page as GBK.
resp.encoding = 'gbk'

soup = BeautifulSoup(resp.text, "html.parser")

# open() does not create missing directories, so make sure the target exists.
os.makedirs("cat_photo", exist_ok=True)

# The images are inside the <ul> with class "clearfix"; if the page has no
# such list there is nothing to download.
ul = soup.find("ul", class_="clearfix")
images = ul.find_all("img") if ul is not None else []
for img in images:
    # The src attribute is appended to the site root to form the image URL.
    src = f"https://pic.netbian.com{img['src']}"

    # The last path component of the URL is used as the file name.
    filename = os.path.basename(src)
    # Download before opening the file so a failed request does not leave an
    # empty file behind.
    resp_img = requests.get(src)
    # "wb": write the raw bytes in binary mode.
    with open(f"cat_photo/{filename}", "wb") as f:
        f.write(resp_img.content)

封装

import requests
from bs4 import BeautifulSoup
import os

# URL of the page whose images are downloaded.
url = "https://pic.netbian.com/e/search/result/?searchid=343"


def craw_html(url):
    """
    Download a page and return its text decoded as GBK.

    :param url: URL of the page to download
    :return: the response body as text
    """
    response = requests.get(url)
    # Force GBK decoding before the text is read.
    response.encoding = 'gbk'
    return response.text


def parse_and_download(html):
    """
    Find the images listed in a page and save each one under ``cat_photo/``.

    :param html: HTML text of the page
    :return: None
    """
    soup = BeautifulSoup(html, "html.parser")

    # The images are inside the <ul> with class "clearfix"; if the page has
    # no such list there is nothing to download.
    ul = soup.find("ul", class_="clearfix")
    if ul is None:
        return

    # open() does not create missing directories, so make sure the target
    # exists.
    os.makedirs("cat_photo", exist_ok=True)

    for img in ul.find_all("img"):
        # The src attribute is appended to the site root to form the image URL.
        src = f"https://pic.netbian.com{img['src']}"

        # The last path component of the URL is used as the file name.
        filename = os.path.basename(src)
        # Download before opening the file so a failed request does not leave
        # an empty file behind.
        resp_img = requests.get(src)
        # "wb": write the raw bytes in binary mode.
        with open(f"cat_photo/{filename}", "wb") as f:
            f.write(resp_img.content)


# Download the page, then parse it and save the images it lists.
html = craw_html(url)
parse_and_download(html)