A small Scrapy + Selenium crawler project

Main code

The spider file under spiders/

import scrapy
from pai.items import PaiItem
from pai.pipelines import dbhandle
from selenium import webdriver
import pymysql

class PaiappSpider(scrapy.Spider):
    name = "paiapp"
    allowed_domains = ["xxxx.com"]
    start_urls = ["https://xxxxx.com"]

    def parse(self, response):
        # Parse the rendered page
        urls_list = response.xpath('//*[@class="pc_card"]/@href').extract()
        Title_list = response.xpath('//*[@class="pc_card"]/div/text()').extract()
        tphoto_list = response.xpath('//*[@class="img_box"]/a/img/@src').extract()
        up_list = response.xpath('//*[@class="card_bottom pc_card" ]/*[position()=2]/div/span/text()').extract()
        com_list = response.xpath('//*[@class="card_bottom pc_card" ]/*[position()=2]/a/span/text()').extract()
        date_list = response.xpath('//*[@class="pic_box time"]//span/text()').extract()
        # Zip the parallel lists into one record per card
        for url, Title, tphoto,up,com,date in zip(urls_list,Title_list, tphoto_list,up_list,com_list,date_list):
            # Populate the item
            item = PaiItem()
            item['url'] = url
            item['title'] = Title
            item['tphoto'] = tphoto
            item['up'] = up
            item['com'] = com
            item['date'] = date
            print(url)
            print(Title)
            print(tphoto)
            print(up)
            print(com)
            print(date)
            yield item
        # Deduplicate by url after all items on this page have been yielded,
        # keeping only the row with the smallest id for each duplicated url
        dbObject = dbhandle()
        cursor = dbObject.cursor()
        cursor.execute("USE pai")
        dt = "DELETE FROM paiapp WHERE url IN ( SELECT * FROM ( SELECT url FROM paiapp GROUP BY url HAVING count( url ) > 1 ) a ) AND id NOT IN ( SELECT * FROM ( SELECT min( id ) AS id FROM paiapp GROUP BY url HAVING count( url ) > 1 ) b )"
        cursor.execute(dt)
        cursor.connection.commit()
        cursor.close()
        dbObject.close()
        print("Duplicates removed")

items

import scrapy
class PaiItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    title = scrapy.Field()
    tphoto = scrapy.Field()
    up = scrapy.Field()
    com = scrapy.Field()
    date = scrapy.Field()

middlewares

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from scrapy import signals  # used to hook spider_closed below
from scrapy.http import HtmlResponse
import time

class PaiDownloaderMiddleware:
    def __init__(self):
        # Configure ChromeOptions (headless mode is optional)
        chrome_options = Options()
        # Headless flag; enabling it needs a few extra options, see the sketch after this class
        # chrome_options.add_argument("--headless")
        # Create the shared driver instance
        self.driver = webdriver.Chrome(options=chrome_options)

    def process_request(self, request, spider):
        # Open the start URL with the real browser
        self.driver.get(request.url)
        time.sleep(1)
        # Click into the column we want to crawl
        button = self.driver.find_element(By.XPATH, '//*[@id="app"]/div[1]/div[3]//div[1]/div/ul/li[3]')
        button.click()
        self.driver.maximize_window()
        time.sleep(2)
        try:
            # Scroll / click "load more" until enough content has been rendered
            self.reflash()
            # Hand the fully rendered HTML back to Scrapy
            page_source = self.driver.page_source
            return HtmlResponse(url=request.url, body=page_source, request=request, encoding='utf-8')
        except Exception as e:
            spider.logger.error(f"Error while rendering {request.url}: {e}")
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        # Downloader middlewares are not notified through a closed() method,
        # so connect the spider_closed signal to shut the browser down.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.driver.quit()


    def reflash(self):
        for i in range(40):  # number of "load more" / scroll iterations
            try:
                # Click the "load more" button if it is present
                a = self.driver.find_element(By.XPATH, '//*[@class="loadingMore"]')
                a.click()
            except Exception:
                # Otherwise scroll down by one viewport height
                self.driver.execute_script('window.scrollBy(0, window.innerHeight);')
            time.sleep(2)
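
To actually enable the headless flag commented out in __init__, Chrome usually needs a couple of companion options. A minimal sketch, assuming a reasonably recent Chrome; the flag spelling and window size are assumptions, not settings from this project:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument("--headless=new")           # older Chrome versions use plain "--headless"
chrome_options.add_argument("--window-size=1920,1080")  # headless defaults to a small viewport, which hurts lazy loading
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://xxxxx.com")  # placeholder URL from the spider
print(driver.title)
driver.quit()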

pipelines

from itemadapter import ItemAdapter
import pymysql

def dbhandle():
    conn = pymysql.connect(
        host = "localhost",
        user = "user",
        passwd = "*****",
        charset = "utf8",
        use_unicode = False
    )
    return conn

class PaiPipeline:
    def process_item(self, item, spider):
        dbObject = dbhandle()
        cursor = dbObject.cursor()
        # Select the database
        cursor.execute("USE pai")
        # Insert one row per item
        sql = "INSERT INTO paiapp(url,title,tphoto,up,com,date) VALUES(%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, (item['url'], item['title'], item['tphoto'], item['up'], item['com'], item['date']))
            cursor.connection.commit()
        except Exception as e:
            spider.logger.error("Insert failed: %s", e)
        finally:
            # A fresh connection is opened for every item, so close it here
            cursor.close()
            dbObject.close()
        return item
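
Because process_item opens a new connection for every item, a variant that keeps a single connection for the whole crawl via the pipeline's open_spider/close_spider hooks may be preferable. A sketch (the credentials mirror the placeholders in dbhandle() and are not real values):

from itemadapter import ItemAdapter
import pymysql

class PaiMySQLPipeline:
    def open_spider(self, spider):
        # One connection for the whole crawl
        self.conn = pymysql.connect(host="localhost", user="user", passwd="*****",
                                    db="pai", charset="utf8")
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        sql = "INSERT INTO paiapp(url,title,tphoto,up,com,date) VALUES(%s,%s,%s,%s,%s,%s)"
        try:
            self.cursor.execute(sql, (adapter['url'], adapter['title'], adapter['tphoto'],
                                      adapter['up'], adapter['com'], adapter['date']))
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            spider.logger.error("Insert failed: %s", e)
        return item

To use this variant instead of PaiPipeline, register "pai.pipelines.PaiMySQLPipeline" in ITEM_PIPELINES in settings.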

settings

BOT_NAME = "pai"

SPIDER_MODULES = ["pai.spiders"]
NEWSPIDER_MODULE = "pai.spiders"

LOG_LEVEL = 'WARNING'

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
   "pai.middlewares.PaiDownloaderMiddleware": 543,
}

ITEM_PIPELINES = {
   "pai.pipelines.PaiPipeline": 300,
}

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
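
With these settings the crawl is normally started from the project root with scrapy crawl paiapp. An equivalent script entry point, as a sketch using Scrapy's CrawlerProcess (the file name run.py is arbitrary):

# run.py - assumes it sits next to scrapy.cfg so the project settings can be found
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl("paiapp")   # spider name defined in PaiappSpider.name
process.start()           # blocks until the crawl is finished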

requirements.txt

attrs               23.2.0
Automat             22.10.0
beautifulsoup4      4.12.3
bs4                 0.0.2
certifi             2024.6.2
cffi                1.16.0
charset-normalizer  3.3.2
constantly          23.10.4
cryptography        42.0.8
cssselect           1.2.0
defusedxml          0.7.1
et-xmlfile          1.1.0
filelock            3.14.0
h11                 0.14.0
hyperlink           21.0.0
idna                3.7
incremental         22.10.0
itemadapter         0.9.0
itemloaders         1.3.1
jmespath            1.0.1
lxml                5.2.2
numpy               1.26.4
openpyxl            3.1.3
outcome             1.3.0.post0
packaging           24.1
pandas              2.2.2
parsel              1.9.1
pip                 24.0
Protego             0.3.1
pyasn1              0.6.0
pyasn1_modules      0.4.0
pycparser           2.22
PyDispatcher        2.0.7
PyMySQL             1.1.1
pyOpenSSL           24.1.0
PySocks             1.7.1
python-dateutil     2.9.0.post0
pytz                2024.1
queuelib            1.7.0
requests            2.32.3
requests-file       2.1.0
Scrapy              2.11.2
selenium            4.21.0
service-identity    24.1.0
setuptools          70.0.0
six                 1.16.0
sniffio             1.3.1
sortedcontainers    2.4.0
soupsieve           2.5
tldextract          5.1.2
trio                0.25.1
trio-websocket      0.11.1
Twisted             24.3.0
twisted-iocpsupport 1.0.4
typing_extensions   4.12.2
tzdata              2024.1
urllib3             2.2.1
w3lib               2.2.1
wsproto             1.2.0
zope.interface      6.4.post2

Some issues encountered:

When simulating scroll-based paging in a small (non-maximized) browser window, some lazy-loaded images kept getting skipped.
Cause:
Scroll/paging coordinates in a small window differ from full screen (some sites responsively re-scale their layout in a small window, so certain lazy-loaded images never enter the viewport and are missed). I have not yet found a way to work around this while keeping the window small.

Workaround: crawl with the webdriver maximized (full screen), as in the sketch below.
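
A minimal standalone sketch of the workaround (the URL and loop count are placeholders): maximize the window before any scrolling, so the lazy-load trigger points line up with the full-size layout.

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://xxxxx.com")   # placeholder URL
driver.maximize_window()          # go full screen BEFORE any scrolling
time.sleep(1)
for _ in range(40):               # same scroll loop as reflash() above
    driver.execute_script('window.scrollBy(0, window.innerHeight);')
    time.sleep(2)
page_source = driver.page_source  # now includes the lazily loaded <img> tags
driver.quit()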
