Python Web Scraping

1. Scraping a Novel

1. Fetch the HTML content of the target URL
2. Extract the content of interest using regular expressions, XPath, Beautiful Soup, or similar tools

  • Beautiful Soup:
    Installation:
pip install beautifulsoup4
or alternatively:
easy_install beautifulsoup4

Official Chinese documentation for Beautiful Soup: http://beautifulsoup.readthedocs.io/zh_CN/latest/

All of the element content we want sits inside a single div tag (<div id="content" class="showtxt">), so the code can be written like this:

from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    req = requests.get(url=target)
    html = req.text
    bf = BeautifulSoup(html, "html.parser")  # use the built-in html.parser
    texts = bf.find_all('div', class_='showtxt')
    print(texts)

Using the text attribute extracts just the text, stripping out the div tag itself, the br tags, and the various whitespace. replace('\xa0' * 8, '\n\n') removes each run of eight consecutive non-breaking spaces (&nbsp;) and replaces it with a blank line.

Final code for getting the chapter content:

from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    req = requests.get(url=target)
    html = req.text
    bf = BeautifulSoup(html, "html.parser")  # use the built-in html.parser
    texts = bf.find_all('div', class_='showtxt')
    # print(texts)
    print(texts[0].text.replace('\xa0' * 8, '\n\n'))

This gives us the content we wanted.
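For comparison, step 2 at the top also mentioned regular expressions and XPath. Below is a minimal sketch of the same extraction using lxml's XPath, assuming the lxml package is installed (pip install lxml):

from lxml import etree
import requests

if __name__ == "__main__":
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    req = requests.get(url=target)
    tree = etree.HTML(req.text)
    # collect all text nodes inside <div class="showtxt">
    texts = tree.xpath('//div[@class="showtxt"]//text()')
    print(''.join(texts).replace('\xa0' * 8, '\n\n'))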

To download the whole novel, we need the link to every chapter. Let's first analyze the novel's table of contents:
URL: http://www.biqukan.com/1_1094/
By inspecting the elements, we can see that all of the chapters are stored under a div tag whose class attribute is listmain.
Looking at the element for the first chapter, it is easy to see that the href attribute of its <a> tag holds the value /1_1094/5403177.html, which is the second half of the chapter URL http://www.biqukan.com/1_1094/5403177.html. The other chapters follow the same pattern, so we can build each chapter's link and name from the href attribute of its <a> tag.

def getxml_xiaoshuo_zhangjie():    
    target = 'http://www.biqukan.com/1_1094/'
    req = requests.get(url=target)
    #req.encoding = 'utf-8'
    #req.encoding = 'GB2312'
    req.encoding = 'GB18030'  # set the encoding explicitly so the Chinese text decodes correctly
    html = req.text
    div_bf = BeautifulSoup(html, "html.parser")
    div = div_bf.find_all('div', class_='listmain')
    # print(div[0])
    print(div)

The printed output is the whole listmain div, with all of the chapter <a> tags inside it.
Calling a.get('href') on one of those <a> tags returns the value of its href attribute: /1_1094/5403177.html

Final code for getting the chapter links:

def getxml_xiaoshuo_zhangjie():
    server = 'http://www.biqukan.com/'
    target = 'http://www.biqukan.com/1_1094/'
    req = requests.get(url=target)
    req.encoding = 'GB18030'
    html = req.text
    div_bf = BeautifulSoup(html, "html.parser")
    div = div_bf.find_all('div', class_='listmain')
    # print(div[0])
    a_bf = BeautifulSoup(str(div[0]), "html.parser")
    a = a_bf.find_all('a')
    # print(a)
    for each in a:
        print(each.string, server + each.get('href'))
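In the code above the full chapter URL is built by plain string concatenation (server + each.get('href')). urllib.parse.urljoin is a slightly more robust alternative; a small sketch:

from urllib.parse import urljoin

server = 'http://www.biqukan.com/'
# urljoin resolves the relative href against the base URL
print(urljoin(server, '/1_1094/5403177.html'))
# -> http://www.biqukan.com/1_1094/5403177.html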

The complete integrated code:

Note: you need to create a folder named xiaoshuo on the D drive first.
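Alternatively, the folder could be created automatically at the top of the script instead of by hand; a minimal sketch, assuming the same D:\xiaoshuo path as below:

import os

save_dir = 'D:\\xiaoshuo'
os.makedirs(save_dir, exist_ok=True)  # no-op if the folder already exists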

import sys
from bs4 import BeautifulSoup
import requests
import os
import time
class downloader(object):

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []            # chapter names
        self.urls = []             # chapter links
        self.nums = 0              # number of chapters

    """
    1、函数说明:获取下载章节的链接    
    """
    def get_download_url(self):
        req = requests.get(url = self.target)
        req.encoding = 'GB18030'
        html = req.text
        div_bf = BeautifulSoup(html, "html.parser")  # use the built-in html.parser
        div = div_bf.find_all('div', class_ = 'listmain')
        a_bf = BeautifulSoup(str(div[0]), "html.parser")
        a = a_bf.find_all('a')
        self.nums = len(a[13:])  # drop the first 13 unwanted links and count the remaining chapters
        for each in a[13:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))
        # print(self.names)
        # print(self.urls)
    """
    2、函数说明:获取章节内容
    Parameters:
        target - 下载连接(string)
    Returns:
        texts - 章节内容(string)
    Modify:
        2017-09-13
    """
    def get_contents(self, target):
        req = requests.get(url = target)
        html = req.text
        bf = BeautifulSoup(html, "html.parser")  # use the built-in html.parser
        # get everything inside the div tag (<div id="content" class="showtxt">)
        texts = bf.find_all('div', class_ = 'showtxt')
        # strip the runs of eight &nbsp; characters and replace them with blank lines
        texts = texts[0].text.replace('\xa0'*8,'\n\n')
        # print(texts)
        return texts

    """
    3、函数说明:将爬取的文章内容写入文件
    Parameters:
        name - 章节名称(string)
        path - 当前路径下,小说保存名称(string)
        text - 章节内容(string)
    Returns:
        无
    Modify:
        2017-09-13
    """
    def writer(self, name, path, text):
        write_flag = True
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')

        # file_handle = open('D:\\xiaoshuo\\1.txt', mode='w')
        # file_handle.write('hello word 你好 \n')

if __name__ == "__main__":
    dl = downloader()
    # dl.writer("","","")

    dl.get_download_url()
    print('Starting download of 《一念永恒》:')
    print('Total:', dl.nums, "chapters")
    # dl.writer(dl.names[1], 'D:\\xiaoshuo\\1.txt', dl.get_contents(dl.urls[1]))

    for i in range(dl.nums):
        dl.writer(dl.names[i], 'D:\\xiaoshuo\\一念永恒.txt', dl.get_contents(dl.urls[i]))
        sys.stdout.write("  Downloaded: %.3f%%" % (float(i / dl.nums) * 100) + '\r')
        sys.stdout.flush()
    print('《一念永恒》 download finished')
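The script imports time but never uses it; if the site starts rejecting rapid requests, a short pause per chapter could be added to the download loop. A sketch of the modified loop body (the 0.5-second delay is an arbitrary, assumed value):

    for i in range(dl.nums):
        dl.writer(dl.names[i], 'D:\\xiaoshuo\\一念永恒.txt', dl.get_contents(dl.urls[i]))
        time.sleep(0.5)  # assumed pause between chapter requests to avoid hammering the server
        sys.stdout.write("  Downloaded: %.3f%%" % (float(i / dl.nums) * 100) + '\r')
        sys.stdout.flush()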

2. Scraping Images from Dynamic Websites

a. Scraping Baidu Images

import os
import urllib.request
import re
import time

def open_url(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360EE')
    page = urllib.request.urlopen(req)  # open the page
    html = page.read().decode('utf-8')  # decode the response body
    # print(html)
    return html

def get_img(html):
    p = r'"(https:[^"]+\.jpg)'  # regex pattern for the image URLs
    imglist = re.findall(p, html)
    # print(imglist)

    # *************** remove adjacent duplicates from the list ** begin *********
    if len(imglist) == 0:
        print("No image URLs found")
        return
    j = 0
    for i in range(len(imglist)):
        if imglist[i] != imglist[j]:
            imglist[j + 1] = imglist[i]
            j += 1
    # print(imglist[:j + 1])
    # *************** remove adjacent duplicates from the list ** end *********
    for each in imglist[:j + 1]:
        print("Image URL:", each)
        filename = each.split("/")[-1].split(".")[0]
        download_img(each, "D:\\image", filename)

    # for each in imglist:
    #     print("Image URL:", each)
    #     filename = each.split("/")[-1]
    #     urllib.request.urlretrieve(each, filename, None)  # save the image
    #     time.sleep(0.01)

# Download an image from a URL into a folder on the D drive
# Example: download_img("https://ss1.bdstatic.com/70cFvXSh_Q1YnxGkpoWK1HF6hhy/it/u=1176216821,2473090430&fm=26&gp=0.jpg","D:\\image","1")
def download_img(img_url, file_path, file_name):
    try:
        # create the target folder if it does not exist yet
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        # get the image file extension
        file_suffix = os.path.splitext(img_url)[1]
        print(file_suffix)
        # build the full file name (including the path)
        filename = '{}{}{}{}'.format(file_path, os.sep, file_name, file_suffix)
        # print(filename)
        # download the image and save it into the folder
        urllib.request.urlretrieve(img_url, filename=filename)

    except IOError as e:
        print("IOError")
    except Exception as e:
        print("Exception")
    pass

if __name__ == '__main__':
    # URL of the Baidu image search results page
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1606186050936_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&sid=&word=%E7%BE%8E%E5%A5%B3'
    get_img(open_url(url))
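The in-place loop in get_img above only removes duplicates that sit next to each other in the list. An order-preserving alternative, shown here only as a sketch, is dict.fromkeys:

def dedupe(urls):
    # dicts preserve insertion order in Python 3.7+, so this drops
    # duplicates while keeping the original ordering
    return list(dict.fromkeys(urls))

# dedupe(['a.jpg', 'a.jpg', 'b.jpg', 'a.jpg'])  ->  ['a.jpg', 'b.jpg']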

b. Scraping Baidu Images (a different approach from a)

import urllib.parse
import json
import requests
import os
import time


def get_urlset(keyword, page):  # build the list of request URLs, one per page of 30 results
    url_list = []
    kw = urllib.parse.quote(keyword)  # percent-encode the keyword, e.g. 车 -> %E8%BD%A6
    print('kw=', kw)
    for j in range(page):
        p = j * 30
        url_list.append(
            "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=" + kw + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=" + kw + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=" + str(
                p) + "&rn=30&gsm=b4")
    print("图片列表数组", url_list)
    return url_list


def get_html(url):  # fetch the content of a URL, falling back to the proxy on failure
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    proxies = {'http': '117.91.250.131:9999'}
    try:
        resp = requests.get(url, headers=headers)
    except:
        resp = requests.get(url, headers=headers, proxies=proxies)
    return resp


# text = get_html(get_urlset("湖北中医药大学", 1)[0]).text

def parse_json(text):  # parse the JSON response to get the image URLs
    img_url = []
    try:
        result = json.loads(text)
        if result:
            for i in result.get("data"):
                img_url.append(i.get("hoverURL"))
        return img_url
    except:
        print("获取图片地址失败")


# print(parse_json(text) )
# Create a folder if it does not already exist
def mkdir(path):
    # os.path.exists(name) checks whether a path exists
    # os.path.join(path, name) joins a directory and a file name
    isExists = os.path.exists(os.path.join("D:/image", path))
    if not isExists:
        print('makedir', path)
        os.makedirs(os.path.join("D:/image", path))
        os.chdir(os.path.join("D:/image", path))
        return True
    else:
        print(path, 'already exists')
        return False


def open_img(img, Number):  # download an image and save it to a file
    if img:
        filename = keyword + str(Number) + ".jpg"  # uses the global `keyword` set in __main__
        with open(r"D:/image/" + keyword + r'/' + filename, 'wb') as f:
            try:
                f.write(get_html(img).content)
                print(filename + " downloaded successfully")
            except:
                print("读取文件失败")


def main(keyword, page):  # main routine
    mkdir(keyword)
    urlset = get_urlset(keyword, page)
    Number = 0  # running counter used in the output file names
    for url in urlset:
        Text = get_html(url).text
        img_url = parse_json(Text)
        if img_url:
            for img in img_url:
                open_img(img, Number)
                Number += 1


if __name__ == '__main__':
    # start timing
    t1 = time.time()
    keyword = input("Enter the keyword of the images to scrape: ")
    page = input("Enter the number of pages to scrape: ")
    page = int(page)
    main(keyword, page)
    print("=耗费的总能时间为=", time.time() - t1)

This post is adapted from https://blog.csdn.net/c406495762/article/details/78123502, with modifications made on top of the original. If there is any infringement, please contact me for removal.
