Python Web Scraping
1. Basic crawler skeleton
Use requests to fetch a page, then inspect the response object's status, encoding, headers, and body.
import requests

# Fetch a page and inspect the response object
url = "https://cn.bing.com/search?q=%E7%99%BE%E5%BA%A6&form=ANSPH1&refig=6636054266074ec18daef89910da4de7&pc=CNNDDB"
resp = requests.get(url)
print(resp.status_code)  # HTTP status code, e.g. 200 on success
print(resp.encoding)     # text encoding guessed from the response headers
print(resp.headers)      # response headers
print(resp.text)         # page body decoded as text
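Sites that block bare requests (the "anti-scraping" cases the later sections report) can often be satisfied by sending a browser-like User-Agent, plus a timeout so a hung connection fails fast. A minimal sketch, not part of the original scripts; the header string and target URL are only illustrative examples:

import requests

# Assumption: the site only checks for a browser-like User-Agent.
# The header value below is an illustrative example, not a requirement
# of any particular site.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
resp = requests.get("https://example.com", headers=headers, timeout=10)
resp.raise_for_status()  # raise an exception instead of silently continuing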
2. Scraping a novel
Fetch a novel's chapter-list page, follow each chapter link, and save every chapter's text to its own file.
import requests
from bs4 import BeautifulSoup

def get_chapter_content(url, encoding):
    # Download one chapter page and extract its body text
    r = requests.get(url)
    r.encoding = encoding
    soup = BeautifulSoup(r.text, "html.parser")
    return soup.find("main", id="content").get_text()

def main(url, out_path, encoding):
    resp = requests.get(url)
    if resp.status_code == 200:
        print("Chapter list fetched successfully")
        soup = BeautifulSoup(resp.text, "html.parser")
        nodes = soup.find_all("li")
        count = 0
        for node in nodes:
            # Chapter links on this site carry this exact inline style
            link = node.find("a", style="padding:8px 0;display:flex;")
            if link is None:
                continue
            count += 1
            # link["href"] is site-absolute, so append it to the site root
            new_url = url.replace("/chapterlist/22087153000372102", "") + link["href"]
            title = link.get_text()
            print(f"Scraping chapter {title}")
            file_name = f"{count}_{title.replace(' ', '')}.txt"
            with open(out_path + "\\" + file_name, "w", encoding=encoding) as f:
                f.write(get_chapter_content(new_url, encoding))
    else:
        print("The page blocked the request (anti-scraping)!")

if __name__ == '__main__':
    url = "https://www.xxsy.net/chapterlist/22087153000372102"
    out_path = r"E:\Destbook\笔记\技能\爬虫\案例\novels"
    encoding = "utf-8"
    main(url, out_path, encoding)
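Looping over dozens of chapter pages fires requests back to back; pausing between them is polite and lowers the odds of being blocked. A minimal sketch under that assumption; polite_get is a hypothetical helper, and the 1-second default is an arbitrary example value:

import time
import requests

def polite_get(url, delay=1.0):
    # Hypothetical helper: sleep before each request so a chapter loop
    # does not hammer the server; the 1-second default is an example value.
    time.sleep(delay)
    return requests.get(url)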
3. Scraping gallery images
Walk a paginated image gallery and download every picture stored under its /uploads/ path.
import os
import requests
from bs4 import BeautifulSoup

def GetCraw(url):
    # Fetch one listing page and return all of its <img> tags
    ts = requests.get(url)
    if ts.status_code != 200:
        print(f"Request to {url} was blocked (anti-scraping)!")
        return None
    ts.encoding = "gbk"  # this site serves GBK-encoded pages
    con = BeautifulSoup(ts.text, "html.parser")
    return con.find_all("img")

def DownImage(images, out_path, base_url):
    for image in images:
        # Only the pictures under /uploads/ are wanted; skip logos, icons, etc.
        if "uploads" not in image["src"]:
            continue
        image_name = os.path.basename(image["src"])
        src = base_url + image["src"]  # image["src"] is site-absolute
        with open(out_path + "\\" + image_name, "wb") as f:
            resp_image = requests.get(src)
            f.write(resp_image.content)

def SinglePage(url, out_path, base_url):
    # Scrape one listing page end to end
    images = GetCraw(url)
    if images:
        DownImage(images, out_path, base_url)

def ManyPage(url, out_path, base_url):
    # Page 1 is the bare listing URL; pages 2-10 follow the index_N.html pattern
    ManySrc = [url] + [url + f"index_{i}.html" for i in range(2, 11)]
    for i, page_url in enumerate(ManySrc):
        SinglePage(page_url, out_path, base_url)
        print(f"Page {i + 1} finished!")

if __name__ == '__main__':
    url = r"https://pic.netbian.com/4kmeinv/"
    base_url = "https://pic.netbian.com"  # site root for resolving image paths
    out_path = r"E:\Destbook\笔记\技能\爬虫\案例\BeautalfulGirl"
    ManyPage(url, out_path, base_url)
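Deriving the site root by stripping "/4kmeinv/" from the listing URL breaks as soon as the listing path changes; the standard library can resolve a site-absolute src against any page URL instead. A minimal alternative sketch using urllib.parse.urljoin, not part of the original script; the image path below is a hypothetical example:

from urllib.parse import urljoin

# urljoin resolves a site-absolute src against the page URL,
# so no manual string surgery on the listing URL is needed.
page_url = "https://pic.netbian.com/4kmeinv/"
src = urljoin(page_url, "/uploads/example.jpg")  # hypothetical image path
print(src)  # https://pic.netbian.com/uploads/example.jpg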