5. The selenium module
selenium is a browser-automation module: a script drives a real browser the way a person would.
It makes it easy to grab dynamically loaded data and to simulate logins.
Fetching ajax-loaded data directly with selenium:
from selenium import webdriver
from lxml import etree
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.pearvideo.com/video_1782544"
driver.get(url)
# Grab the rendered page source
html = driver.page_source
etree_html = etree.HTML(html)
video = etree_html.xpath("//div[@id='drag_target1']//video/@src")[0]
print(video)
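The snippet above reads page_source immediately, which only works if the <video> tag has already been rendered. A hedged variant using an explicit wait (same page and xpath assumed), so the script blocks until the element exists instead of racing the ajax call:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(executable_path="./chromedriver")
driver.get("https://www.pearvideo.com/video_1782544")
# Wait up to 10 s for the <video> tag to appear, then read its src directly
video_el = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, "//div[@id='drag_target1']//video")))
print(video_el.get_attribute("src"))
driver.quit()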
selenium automation actions
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.taobao.com/"
driver.get(url)
# Locate the search box
search_input = driver.find_element(by=By.ID, value="q")
# Type the search terms
search_input.send_keys("西屋浴霸")
# Locate the search button
button = driver.find_element(by=By.CLASS_NAME, value="btn-search")
# Scroll to the bottom of the page
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
# Pause for 2 seconds
time.sleep(2)
# Click the search button
button.click()
# Navigate to another page
driver.get("http://www.baidu.com")
# Go back
driver.back()
# Go forward
driver.forward()
Entering an iframe and action chains
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Import the action-chain class
from selenium.webdriver import ActionChains

# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
driver.get(url)
# Switch into the iframe; the draggable div lives inside it
driver.switch_to.frame("iframeResult")
# Instantiate the action chain
action = ActionChains(driver)
# Locate the draggable div
div = driver.find_element(by=By.ID, value="draggable")
# Click and hold the div
action.click_and_hold(div)
# Drag in steps
for i in range(5):
    # perform() executes the queued actions; (50, 0) moves 50 px along x, 0 along y
    action.move_by_offset(50, 0).perform()
    time.sleep(0.3)
# Release the mouse button
action.release().perform()
# Close the browser
driver.quit()
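Depending on the selenium version, calling perform() inside the loop can replay the actions queued so far. An alternative sketch queues the whole drag as one chain with pause() and executes it once (same div element assumed):

# Build the full drag as a single chain, then execute it in one go
chain = ActionChains(driver).click_and_hold(div)
for i in range(5):
    # Each step: 50 px right, then a 0.3 s pause
    chain = chain.move_by_offset(50, 0).pause(0.3)
chain.release().perform()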
selenium headless browsing and detection evasion
from selenium import webdriver
import time
# ChromeOptions carries both the headless flags and the evasion switch
from selenium.webdriver import ChromeOptions

options = ChromeOptions()
# Headless browser: no visible window
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Evade detection: drop the "enable-automation" switch
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Load the browser driver with the combined options
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
url = "https://www.baidu.com/"
driver.get(url)
# Grab the page source
html = driver.page_source
print(html)
time.sleep(2)
# Close the browser
driver.quit()
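excludeSwitches only removes Chrome's "being controlled by automated software" flag; many sites also check navigator.webdriver from JavaScript. A common companion trick on the Chrome driver, sketched here (run it right after creating the driver, before driver.get; the injected JS is an assumption about what the target site inspects):

# Inject a script that runs before any page script, masking navigator.webdriver
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})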
selenium screenshots and cropping
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
# Pillow does the cropping
from PIL import Image

# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver")
url = "https://www.baidu.com/"
# Open the page and maximize the window
driver.get(url)
driver.maximize_window()
time.sleep(2)
# Screenshot the whole viewport
driver.save_screenshot("./baidu.png")
# Locate the target element
lg = driver.find_element(by=By.CLASS_NAME, value="s_form_wrapper")
# Top-left corner of the element
location = lg.location
# Width and height of the element
size = lg.size
print(location, size)
# Display scaling must be set to 100%, otherwise the crop box is off
# Crop box: top-left to bottom-right coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']),
          int(location['y'] + size['height']))
# Open the screenshot with Pillow
i = Image.open("./baidu.png")
# Crop it
frame = i.crop(rangle)
# Save the crop
frame.save("./baidu_cut.png")
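The 100% display-scaling requirement exists because location and size are in CSS pixels while the screenshot is in device pixels. A hedged way to crop correctly at any scaling is to read the ratio from the browser and scale the crop box:

# Scale the crop box by the device pixel ratio instead of requiring 100% scaling
ratio = driver.execute_script("return window.devicePixelRatio")
rangle = (int(location['x'] * ratio), int(location['y'] * ratio),
          int((location['x'] + size['width']) * ratio),
          int((location['y'] + size['height']) * ratio))
frame = Image.open("./baidu.png").crop(rangle)
frame.save("./baidu_cut.png")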
Simulating zhihu login
# Simulate logging in to zhihu
# Chaojiying_Client is the captcha-recognition client from chaojiying.com
from chaojiying.chaojiying import Chaojiying_Client
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image

options = webdriver.ChromeOptions()
# Keep the browser open after the script finishes
options.add_experimental_option('detach', True)
# Load the browser driver
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
url = "https://www.zhihu.com/signin?next=%2Fhot"
# Open the sign-in page
driver.get(url)
time.sleep(1)
# Switch to the password-login tab
tabs = driver.find_elements(by=By.CLASS_NAME, value="SignFlow-tab")
tabs[1].click()
# Fill in the username and password fields
username_inputs = driver.find_elements(
    by=By.CLASS_NAME, value="username-input")
username_inputs[0].send_keys("your_phone_number")
username_inputs[1].send_keys("your_password")
# Locate and click the login button
btn = driver.find_element(
    by=By.CLASS_NAME, value="SignFlow-submitButton")
btn.click()
time.sleep(1)
# The slider captcha appears; locate its background image
img = driver.find_element(By.CLASS_NAME, "yidun_bg-img")
# Position and size of the image
location = img.location
size = img.size
# Crop box: top-left to bottom-right coordinates
rangle = (int(location['x']), int(location['y']),
          int(location['x'] + size['width']),
          int(location['y'] + size['height']))
time.sleep(1)
# Locate the slider handle
slider_move = driver.find_element(By.CLASS_NAME, "yidun_slider")
print(slider_move.size)
# Width of the slider handle
slider_width = slider_move.size['width']
print(slider_width)
# Screenshot the page, then crop out the captcha image
driver.save_screenshot("./drag1.png")
i = Image.open("./drag1.png")
frame = i.crop(rangle)
frame.save("./drag1_cut.png")
time.sleep(1)
# User center >> Software ID: generate one and substitute it for 947550
cjy = Chaojiying_Client('username', 'password', '947550')
im = open('./drag1_cut.png', 'rb').read()  # local image path; Windows sometimes needs //
obj = cjy.PostPic(im, 9101)  # 9101 is the captcha type code (see chaojiying's docs)
print(obj)
# Recognized coordinates come back as "x,y"
location = obj['pic_str'].split(',')
x = location[0]
print(x)
# Build the action chain
action = ActionChains(driver)
# Click and hold the slider
action.click_and_hold(slider_move).perform()
# chaojiying measures from the slider's midpoint, so subtract half the
# slider width to get the distance to move
distance = int(x) - int(slider_width) / 2
# Drag the slider the computed distance along the x axis
action.move_by_offset(distance, 0).perform()
# Release the slider
action.release().perform()
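Sliding the whole distance in one move is easy for the risk engine to flag. A hedged refinement drags in small randomized steps with short pauses (the step sizes are purely illustrative):

import random

# Mimic a human drag: uneven steps, brief pauses, a fresh chain per step
ActionChains(driver).click_and_hold(slider_move).perform()
moved = 0
while moved < distance:
    step = min(random.randint(5, 15), distance - moved)
    ActionChains(driver).move_by_offset(step, 0).perform()
    moved += step
    time.sleep(random.uniform(0.05, 0.2))
ActionChains(driver).release().perform()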
6. The scrapy crawler framework
Features: high-performance persistent storage, asynchronous downloading, high-performance parsing, and distributed crawling.
# Install
pip install scrapy
# Create a project and a spider
scrapy startproject proName
cd proName
scrapy genspider yourspidername www.baidu.com
# In settings.py, stop obeying the ROBOTS protocol
ROBOTSTXT_OBEY = False
# Set a UA disguise and quiet the log
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
LOG_LEVEL = "ERROR"
# Run the spider
scrapy crawl yourspidername
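The crawl can also be kicked off from a plain Python script, which makes IDE debugging easier (a minimal sketch; place it at the project root next to scrapy.cfg):

# run.py: start the spider without typing the command in a terminal
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "yourspidername"])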
scrapy data parsing
import scrapy

class DoubanSpider(scrapy.Spider):
    name = "douban"
    # allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/"]

    def parse(self, response):
        li = response.xpath(
            "//div[@id='screening']//ul[@class='ui-slide-content']/li")
        for i in li:
            # xpath returns a list of Selector objects;
            # extract_first() pulls the value out of the first one
            print(i.xpath("./@data-title").extract_first())
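Besides extract_first(), a SelectorList also has extract() for all matches; newer scrapy spells these get() and getall(). All of the following work on the same selector:

title = i.xpath("./@data-title").extract_first()  # first match, or None
title = i.xpath("./@data-title").get()            # modern alias of extract_first()
titles = i.xpath("./@data-title").getall()        # list of every match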
scrapy data persistence
Feed-export (terminal command) storage: it can only persist what the parse method returns, and only to a local file.
def parse(self, response):
    movies = []
    li = response.xpath(
        "//div[@id='screening']//ul[@class='ui-slide-content']/li")
    for i in li:
        # xpath returns a list of Selector objects;
        # extract_first() pulls the value out of the first one
        movie_name = i.xpath("./@data-title").extract_first()
        # Movie rating
        movie_score = i.xpath("./@data-rate").extract_first()
        if movie_name is not None:
            movies.append({
                "movie_name": movie_name,
                "movie_score": movie_score
            })
    return movies
Command: scrapy crawl douban -o ./douban.csv (the extension must be a format scrapy supports: json, csv, xml, ...)
Terminal-command storage is simple to run, but it is also quite limited.
Pipeline-based storage:
Workflow:
Enable the pipeline in settings.py
ITEM_PIPELINES = {
    # 300 is the priority; lower numbers run first
    "dbdyPro.pipelines.DbdyproPipeline": 300,
}
Parse the data
Define the relevant fields on the item class
import scrapy

class DbdyproItem(scrapy.Item):
    # define the fields for your item here like:
    movie_name = scrapy.Field()
    movie_score = scrapy.Field()
Wrap the parsed data in an item object
Submit the item object to the pipeline for persistence
def parse(self, response):
    li = response.xpath(
        "//div[@id='screening']//ul[@class='ui-slide-content']/li")
    for i in li:
        # Instantiate a fresh item on every iteration; reusing one object
        # would make every yield mutate the same item
        pipitem = DbdyproItem()
        # xpath returns a list of Selector objects;
        # extract_first() pulls the value out of the first one
        movie_name = i.xpath("./@data-title").extract_first()
        # Movie rating
        movie_score = i.xpath("./@data-rate").extract_first()
        if movie_name is not None:
            pipitem['movie_name'] = movie_name
            pipitem['movie_score'] = movie_score
            # Hand the item to the pipeline
            yield pipitem
In the pipeline class's process_item, persist the items to a local file or a database.
from itemadapter import ItemAdapter

class DbdyproPipeline:
    fp = None

    # Overridden hook: runs exactly once when the spider starts,
    # so the file is opened only once
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./douban.txt", "w", encoding="utf-8")

    # Receives the items yielded by the spider;
    # called once per item
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        # Write the record to the file
        self.fp.write(movie_name + ":" + movie_score + "\n")
        return item

    # Overridden hook: runs once when the spider closes
    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()
Storing data through multiple pipelines (multiple targets)
Configure the pipelines
ITEM_PIPELINES = {
    # 300 is the priority; lower numbers run first
    "dbdyPro.pipelines.DbdyproPipeline": 300,
    "dbdyPro.pipelines.MysqlPipeline": 301,
}
Write multiple pipeline classes
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from itemadapter import ItemAdapter
import pymysql

class DbdyproPipeline:
    fp = None

    # Overridden hook: runs exactly once when the spider starts,
    # so the file is opened only once
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./douban.txt", "a", encoding="utf-8")

    # Receives the items yielded by the spider; called once per item
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        # Write the record to the file
        self.fp.write(movie_name + ":" + movie_score + "\n")
        # Returning the item hands it on to the next pipeline class
        return item

    # Overridden hook: runs once when the spider closes
    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()

# One pipeline class per storage target
class MysqlPipeline(object):
    # MySQL connection
    conn = None
    # MySQL cursor
    cursor = None

    # Overridden hook: connect to the database once when the spider starts
    def open_spider(self, spider):
        print("database writes starting...")
        self.conn = pymysql.Connect(host="localhost", port=3306,
                                    user='root', password='root',
                                    db='py_douban_data', charset='utf8')

    # Receives each item and inserts it
    def process_item(self, item, spider):
        movie_name = item['movie_name']
        movie_score = item['movie_score']
        self.cursor = self.conn.cursor()
        # Parameterized query: let the driver do the quoting
        sql = "insert into m_data (m_name, m_score) values (%s, %s)"
        # Execute inside a transaction
        try:
            self.cursor.execute(sql, (movie_name, str(movie_score)))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    # Close the database connection
    def close_spider(self, spider):
        print("database writes finished...")
        self.cursor.close()
        self.conn.close()
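MysqlPipeline assumes the database py_douban_data and the table m_data already exist. A one-off setup sketch (the schema is an assumption inferred from the INSERT above):

import pymysql

# Create the table the pipeline writes to; column types are assumptions
conn = pymysql.Connect(host="localhost", port=3306, user='root',
                       password='root', db='py_douban_data', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS m_data (
        id INT PRIMARY KEY AUTO_INCREMENT,
        m_name VARCHAR(255),
        m_score VARCHAR(16)
    )
""")
conn.commit()
cursor.close()
conn.close()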
Multi-page crawling. Pages are downloaded asynchronously, so items are not necessarily persisted in page order.
import scrapy
# PronameItem is the item class defined in this project's items.py
from proName.items import PronameItem

class GushiciSpider(scrapy.Spider):
    name = "gushici"
    # allowed_domains = ["www.gushici.net"]
    start_urls = ["https://www.gushici.net/mingju/"]
    # Crawl every listing page: a generic url template plus a page counter
    url = "https://www.gushici.net/mingju/index_%d.html"
    start_page_num = 2

    def parse(self, response):
        boxs = response.xpath(
            "//div[@class='left']/div[@class='ju']/div[@class='ju-box']")
        for box in boxs:
            item = PronameItem()
            a_arr = box.xpath("./a")
            con = a_arr[0].xpath("./text()").extract_first()
            author = a_arr[1].xpath("./text()").extract_first()
            item['con'] = con
            item['author'] = author
            # Hand the item to the pipeline
            yield item
        # Queue the next page
        if self.start_page_num <= 3:
            new_url = self.url % self.start_page_num
            self.start_page_num += 1
            # Request the next page and call parse again on its response
            yield scrapy.Request(new_url, callback=self.parse)
scrapy's 5 core components
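The five components: the engine (coordinates everything and moves data between the other four), the scheduler (queues and de-duplicates requests), the downloader (fetches pages asynchronously), the spiders (turn responses into items or new requests), and the item pipeline (persists items). Downloader middleware and spider middleware hook into the connections between them.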
Passing parameters across pages from the parse method
import scrapy
from bossPro.items import BossproItem

class BossSpider(scrapy.Spider):
    name = "boss"
    # allowed_domains = ["www.zhipin.com"]
    # boss zhipin loads its listings via ajax, so use a different site
    start_urls = [
        "https://www.gushici.net/mingju/index.html"]
    new_page_model = "https://www.gushici.net/mingju/index_%d.html"
    page_num = 2

    def parse(self, response):
        li_list = response.xpath("//div[@class='ju']/div[@class='ju-box']")
        for li in li_list:
            title = li.xpath("./a[1]/text()").extract_first()
            link_url = li.xpath("./a[1]/@href").extract_first()
            # The item must be instantiated inside the loop; otherwise only
            # the last record survives, because each assignment mutates the
            # one shared object
            item = BossproItem()
            item['title'] = title
            # Make the url absolute
            link_url = response.urljoin(link_url)
            # Request passing: hand the item to the callback through meta
            yield scrapy.Request(link_url, callback=self.parse_detail,
                                 meta={'item': item})
        # Next page
        if self.page_num <= 4:
            new_url = format(self.new_page_model % self.page_num)
            self.page_num += 1
            # Recursive call
            yield scrapy.Request(new_url, callback=self.parse)

    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath(
            "//div[@class='gushici-box-text']//a/text()")
        cons = content.extract()
        # Join the fragments
        cons_p = "".join(cons)
        item['con_p'] = cons_p
        yield item
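Since scrapy 1.7, cb_kwargs is the preferred way to pass values to a callback; meta still works but is also used internally (e.g. for proxies). The same hand-off sketched with cb_kwargs:

# Pass the item as a real keyword argument of the callback
yield scrapy.Request(link_url, callback=self.parse_detail,
                     cb_kwargs={'item': item})

def parse_detail(self, response, item):
    cons = response.xpath(
        "//div[@class='gushici-box-text']//a/text()").extract()
    item['con_p'] = "".join(cons)
    yield item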
Crawling images
① Parse the image URLs
import scrapy
# PronameItem is the item class defined in this project's items.py
from proName.items import PronameItem

class ZztpSpider(scrapy.Spider):
    name = "zztp"
    # allowed_domains = ["sc.chinaz.com"]
    start_urls = ["https://acg.gamersky.com/pic/wallpaper/"]

    def parse(self, response):
        lis = response.xpath("//ul[@class='pictxt block contentpaging']/li")
        for li in lis:
            img_src = li.xpath("./a/img/@src").extract_first()
            item = PronameItem()
            item['img_src'] = img_src
            yield item
② Define the item field
class PronameItem(scrapy.Item):
    # define the fields for your item here like:
    img_src = scrapy.Field()
③ Write a custom pipeline class
# Import scrapy's image pipeline
from scrapy.pipelines.images import ImagesPipeline
import scrapy

# Custom image pipeline: subclass ImagesPipeline and override 3 methods
class MyImagesPipeline(ImagesPipeline):
    # 1. Issue a request for each image url
    def get_media_requests(self, item, info):
        yield scrapy.Request(url=item['img_src'])

    # 2. Decide the file name (saved under IMAGES_STORE)
    def file_path(self, request, response=None, info=None, *, item=None):
        # Use the last url segment as the file name
        img_name = request.url.split('/')[-1]
        return img_name

    # 3. Pass the item on to the next pipeline class
    def item_completed(self, results, item, info):
        return item
④ Set the image directory and enable the pipeline (ImagesPipeline needs Pillow: pip install Pillow)
ITEM_PIPELINES = {
    "proName.pipelines.MyImagesPipeline": 300,
}
# Directory the images are saved into
IMAGES_STORE = "./images"
Downloader middleware
import random

class MidDownloaderMiddleware:
    # http proxy pool
    PROXY_IP = [
        'http://ip:port',
        'http://ip:port',
    ]
    # https proxy pool
    PROXY_IPS = [
        'https://ip:port',
        'https://ip:port',
    ]
    # UA pool
    UA_LIST = [
        'ua1',
        'ua2',
    ]

    def process_request(self, request, spider):
        # Rotate the User-Agent on every request
        request.headers['User-Agent'] = random.choice(self.UA_LIST)
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        # If a banned IP raises an exception, swap in a proxy here,
        # matching the request's scheme (http vs https)
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = random.choice(self.PROXY_IPS)
        else:
            request.meta['proxy'] = random.choice(self.PROXY_IP)
        # Returning the request re-schedules it for download
        return request
DOWNLOADER_MIDDLEWARES = {
"mid.middlewares.MidDownloaderMiddleware": 543,
}
Combining scrapy with selenium
# Spider: the netease nav pages are rendered dynamically, so the downloader
# middleware below swaps in selenium for them. In a real project the item,
# middleware and pipeline classes live in items.py, middlewares.py and
# pipelines.py; they are listed together here.
import scrapy
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

class WySpider(scrapy.Spider):
    name = "wy"
    # allowed_domains = ["163.com"]
    start_urls = ["https://money.163.com/"]
    navs = ["T1495590561605.html",
            "T1524534586033.html", "T1603455486882.html", "T1624330083376.html"]
    selenium_urls = []

    def __init__(self):
        # One shared browser for the whole crawl
        self.driver = webdriver.Chrome(executable_path="../chromedriver.exe")

    def parse(self, response):
        # Build the nav-page urls and remember which ones need selenium
        for nav in self.navs:
            url = "https://www.163.com/dy/media/" + nav
            self.selenium_urls.append(url)
            yield scrapy.Request(url, callback=self.parse_nav)

    # Parse a nav page
    def parse_nav(self, response):
        li_list = response.xpath("//div[@class='tab_content']//li")
        for li in li_list:
            link_url = li.xpath("./a/@href").extract_first()
            title = li.xpath("./a/img/@alt").extract_first()
            item = WangyiItem()
            item['title'] = title
            item['link_url'] = link_url
            if link_url is not None:
                yield scrapy.Request(link_url, callback=self.parse_detail,
                                     meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        p_text = response.xpath(
            "//div[@class='post_body']/p/text()").extract_first()
        item['p_text'] = p_text
        # Hand the item to the pipeline
        yield item

    # Called when the spider closes: shut the browser down
    def closed(self, spider):
        self.driver.quit()

class WangyiItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    link_url = scrapy.Field()
    p_text = scrapy.Field()

class WangyiDownloaderMiddleware:
    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        driver = spider.driver
        # Only the nav pages need the rendered DOM
        if request.url in spider.selenium_urls:
            # Let selenium load the page instead of using scrapy's response
            driver.get(request.url)
            # Wait 3 seconds for the page to finish rendering
            time.sleep(3)
            page_text = driver.page_source
            # Wrap the rendered source in a new response object
            new_response = HtmlResponse(
                url=request.url, body=page_text, encoding="utf-8")
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        pass

class WangyiPipeline:
    def process_item(self, item, spider):
        # Persist the item (just print it here)
        print(item)
        return item
Whole-site crawling (CrawlSpider)
# Create a project
scrapy startproject proName
cd proName
# The crawl template generates a CrawlSpider
scrapy genspider -t crawl movie2345 2345.com
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# GushiciItem is defined in this project's items.py
from proName.items import GushiciItem

class GscSpider(CrawlSpider):
    name = "gsc"
    # allowed_domains = ["gushici.com"]
    start_urls = ["https://www.gushici.net/"]
    # LinkExtractor(allow=r"...") extracts the links whose urls match the regex
    link = LinkExtractor(allow=r"\/[a-z_0-9]+\/")
    # follow=True keeps applying the rule to the pages it finds,
    # so the whole site is crawled recursively; False stops at one level
    rules = (Rule(link, callback="parse_item", follow=True),)

    def parse_item(self, response):
        # Every gushici-box on the page
        gushici_box = response.xpath('//div[@class="gushici-box"]')
        # Pull the title, source and body out of each box
        for box in gushici_box:
            title = box.xpath('./p[@class="tit"]//b/text()').extract_first()
            author = box.xpath(
                './p[@class="source"]//a/text()').extract()
            author_str = ""
            # Only join when both source fragments are present
            if len(author) >= 2:
                author_str = author[0] + ": " + author[1]
            content = box.xpath(
                './div[@class="gushici-box-text"]')
            content_str = content.xpath("string()").get()
            item = GushiciItem()
            item['title'] = title
            item['author_str'] = author_str
            item['content_str'] = content_str
            yield item
class GushiciItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author_str = scrapy.Field()
    content_str = scrapy.Field()
from itemadapter import ItemAdapter

class GushiciPipeline:
    fp = None

    # Overridden hook: open the file once when the spider starts
    def open_spider(self, spider):
        print("crawl started")
        self.fp = open("./gushici.txt", "a", encoding="utf-8")

    def process_item(self, item, spider):
        # Write the item to the file, skipping incomplete records
        if (item['title'] is not None and item['author_str'] is not None
                and item['content_str'] is not None):
            self.fp.write(item['title'] + "\n" +
                          item['author_str'] + "\n" +
                          item['content_str'] + "\r\n\n")
        return item

    def close_spider(self, spider):
        print("crawl finished")
        self.fp.close()
Distributed crawling:
Native scrapy cannot crawl distributedly, because each machine's scheduler and pipelines are independent of the others; install scrapy-redis to make distribution work.
scrapy-redis gives scrapy a shared, Redis-backed scheduler and pipeline.
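A minimal sketch of the settings that move a project onto scrapy-redis (Redis on localhost assumed; the class paths are scrapy-redis defaults):

# settings.py additions for scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # shared scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared de-duplication
SCHEDULER_PERSIST = True  # keep the Redis queue across restarts
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 400,  # items are stored in Redis
}
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379

The spider itself then inherits from scrapy_redis.spiders.RedisCrawlSpider and declares a redis_key instead of start_urls; every worker pops its start urls from that shared Redis list.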