Python web scraping: crawling Douban Books with proxies and XPath

The script automatically creates a folder for every Douban Books tag and sends its requests through proxies to dodge the anti-scraping checks. My proxies apparently weren't real, though, so I still got blocked... Crawling with a cookie set does work, but after a certain number of pages Douban demands a CAPTCHA that has to be entered manually in the browser... All in all, the code turned out rather messy.
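Before the full script, here is a minimal sketch of the request setup it relies on: a random Chrome user-agent from fake_useragent, a login cookie copied from the browser, and a proxies dict passed to requests. The cookie string and proxy address below are placeholders, not values from the original post.

# Minimal sketch of the request setup used throughout the script.
# The cookie and proxy values are placeholders -- substitute your own.
import requests
from fake_useragent import UserAgent

ua = UserAgent()
headers = {
    'user-agent': ua.Chrome,               # random Chrome user-agent string
    'Cookie': '<your Douban cookie here>',  # copied from a logged-in browser session
}
proxies = {'https': 'https://127.0.0.1:8888'}  # hypothetical proxy endpoint

resp = requests.get('https://book.douban.com/tag/', headers=headers,
                    proxies=proxies, timeout=3)
print(resp.status_code)

The complete script follows.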

import requests
from lxml import etree
from fake_useragent import UserAgent
import threading
import queue
import os
from urllib.parse import urljoin
import json
import csv
# import the proxy pool helper
from proxies.get_proxy import get_ip_port
import random


# fetch candidate proxy addresses from the pool and drop empty entries
temp = list(map(lambda x: x[1], get_ip_port()))
temp = [x for x in temp if x]
# build the request headers
ua = UserAgent()
headers = {
    'user-agent': ua.Chrome,
    'Cookie': '<set your own cookie here>'
    }
# lock for shared state
LOCK = threading.Lock()

# global queues for the tag urls and save paths
q_url = queue.Queue()
q_path = queue.Queue()


# worker thread: takes one tag url and save path off the queues and crawls it
class MyThread(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        # list of dicts holding the scraped books
        self.data_list = []
        LOCK.acquire()
        self.path = q_path.get()
        self.url = q_url.get()
        self.flag = False
        LOCK.release()

    def run(self) -> None:
        self.parse_url(self.url, random.choice(temp))
        self.save_file()

    # request the url and parse the book list
    def parse_url(self, temp_url, proxy):
        if not self.flag:
            # no successful request yet: pick a fresh random proxy
            proxies = {
                'https': random.choice(temp)
            }
        else:
            proxies = proxy

        try:
            url = temp_url
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
            if resp.status_code == 200:

                self.flag = True

                html = etree.HTML(resp.content)
                lis = html.xpath('//ul[@class="subject-list"]/li')
                if lis:
                    for li in lis:
                        # dict to hold one book's fields
                        data_dict = {}
                        # title
                        title = li.xpath('.//h2/a/text()')[0]
                        # clean the data
                        title = title.strip()
                        data_dict['title'] = title
                        # author, publisher, year, price
                        all_info = li.xpath('.//div[@class="pub"]/text()')[0]
                        # clean and split the data
                        all_info = all_info.strip().split('/')
                        # 4 fields means one author; 5 fields means two authors
                        if len(all_info) == 4:
                            # author
                            author = all_info[0]
                            # publisher
                            pub_house = all_info[1]
                            # date
                            datetime = all_info[2]
                            price = all_info[3]

                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        elif len(all_info) == 5:
                            # author (two names joined with '/')
                            author = all_info[0] + '/' + all_info[1]
                            # publisher
                            pub_house = all_info[2]
                            # date
                            datetime = all_info[3]
                            price = all_info[4]

                            data_dict['author'] = author
                            data_dict['pub_house'] = pub_house.strip()
                            data_dict['datetime'] = datetime.strip()
                            data_dict['price'] = price.strip()
                        # rating
                        rating_nums = li.xpath('.//span[@class="rating_nums"]/text()')
                        if rating_nums:
                            rating_nums = rating_nums[0].strip()
                        else:
                            rating_nums = None
                        # number of ratings
                        comment_count = li.xpath('.//span[@class="pl"]/text()')[0]
                        # strip the surrounding "(...人评价)" text
                        comment_count = comment_count.strip().replace('(', '').replace('人评价)', '')
                        # summary
                        text = li.xpath('.//p/text()')
                        if text:
                            # clean the data
                            text = text[0].strip().replace('\r', '').replace('\n', '')
                        else:
                            text = None

                        data_dict['rating_nums'] = rating_nums
                        data_dict['comment_count'] = comment_count
                        data_dict['text'] = text

                        print(data_dict)
                        self.data_list.append(data_dict)

                next_url = html.xpath('//span[@class="next"]/a/@href')
                # follow the next page if there is one
                if next_url:
                    next_url = urljoin(url, next_url[0])
                    print(next_url)
                    return self.parse_url(next_url, proxies)
            else:
                self.flag = False

                return self.parse_url(url, proxies)

        except Exception as e:
            LOCK.acquire()
            q_url.put(self.url)
            q_path.put(self.path)
            LOCK.release()

    # save the data to a json file and a csv file
    def save_file(self):

        if self.data_list:
            if not os.path.exists(self.path + '.json'):
                with open(self.path + '.json', 'w', encoding='utf-8') as f:
                    json.dump(self.data_list, f, ensure_ascii=False, indent=4)
                print('json file written')
            if not os.path.exists(self.path + '.csv'):
                with open(self.path + '.csv', 'w', encoding='utf-8', newline='') as f:
                    # header row
                    title = self.data_list[0].keys()
                    # create a csv writer
                    writer = csv.writer(f)
                    # write the header
                    writer.writerow(title)
                    # write the data rows
                    for row in self.data_list:
                        writer.writerow(row.values())
                print('csv file written')


# fetch the tag names, build the save paths and fill the queues
def get_tag():
    # base path for the output files
    path = r'E:\PycharmCode\dbds_spiders'
    # pick a proxy
    proxies = {
        'https': random.choice(temp)
    }
    # list of the top-level tag names
    big_tag_list = []
    # page that lists all the book tags
    tag_url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    try:
        # request the tag page
        resp = requests.get(tag_url, headers=headers, proxies=proxies, timeout=3)
        if resp.status_code == 200:
            # parse the response for xpath
            html = etree.HTML(resp.content)
            # top-level tag blocks
            big_tag = html.xpath('//div[@class="article"]/div[2]/div')
            # iterate over the top-level tags
            for tag in big_tag:
                # tag name; the raw text looks like "文学 · · · · · ·"
                tag_name = tag.xpath('.//h2/text()')[0]
                # clean it up
                big_tag_name = tag_name.split('·')[0].strip()
                big_tag_list.append(big_tag_name)

                # sub-tags under this top-level tag
                tds = tag.xpath('.//td')
                # get each sub-tag's name and url
                for td in tds:
                    min_tag_name = td.xpath('./a/text()')[0]
                    min_tag_url = td.xpath('./a/@href')[0]
                    min_tag_url = urljoin(tag_url, min_tag_url)
                    # build the save path
                    file_path = path + '\\' + big_tag_name + '\\' + min_tag_name
                    # put the url and path onto the matching queues
                    q_url.put(min_tag_url)
                    q_path.put(file_path)

            return big_tag_list
        else:
            return get_tag()
    except Exception as e:
        return get_tag()


# create the folders
def make_file(big_tag_list):
    # base path for the output files
    path = r'E:\PycharmCode\dbds_spiders'

    # create each top-level tag folder if it does not exist yet
    for big_temp in big_tag_list:
        big_path = path + '\\' + big_temp
        # create the folder
        if not os.path.exists(big_path):
            os.makedirs(big_path)

    print('folders created')


def main():

    # extract the sub-tag urls and save paths and put them on the queues
    big_tag_list = get_tag()
    make_file(big_tag_list)

    while not q_url.empty() and not q_path.empty():
        ths = []
        # spawn up to 3 worker threads per batch; never more than there are
        # queued tasks, otherwise MyThread.__init__ would block on q.get()
        for i in range(min(3, q_url.qsize())):
            t = MyThread()
            ths.append(t)
        for t in ths:
            t.start()
        for t in ths:
            t.join()


if __name__ == '__main__':

    main()
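One caveat for anyone trying to run this: proxies.get_proxy is a local helper module that isn't included in the post. The script only assumes that get_ip_port() returns an iterable of tuples whose second element is a usable proxy address (it keeps x[1] from each tuple). A purely hypothetical stand-in, just to make the script importable, might look like this:

# proxies/get_proxy.py -- hypothetical stand-in for the author's proxy pool,
# which is not shown in the post. Replace the addresses with proxies from your
# own provider; the script only uses the second element of each tuple.
def get_ip_port():
    return [
        ('https', 'https://127.0.0.1:8888'),
        ('https', 'https://127.0.0.1:8889'),
    ]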