Scrapy + Selenium mini crawler project
Main code
The spider file under spiders/
import scrapy
from pai.items import PaiItem
from pai.pipelines import dbhandle


class PaiappSpider(scrapy.Spider):
    name = "paiapp"
    allowed_domains = ["xxxx.com"]
    start_urls = ["https://xxxxx.com"]

    def parse(self, response):
        # Parse the page rendered by the Selenium downloader middleware
        urls_list = response.xpath('//*[@class="pc_card"]/@href').extract()
        title_list = response.xpath('//*[@class="pc_card"]/div/text()').extract()
        tphoto_list = response.xpath('//*[@class="img_box"]/a/img/@src').extract()
        up_list = response.xpath('//*[@class="card_bottom pc_card"]/*[position()=2]/div/span/text()').extract()
        com_list = response.xpath('//*[@class="card_bottom pc_card"]/*[position()=2]/a/span/text()').extract()
        date_list = response.xpath('//*[@class="pic_box time"]//span/text()').extract()
        # Zip the parallel lists into one record per card
        for url, title, tphoto, up, com, date in zip(urls_list, title_list, tphoto_list, up_list, com_list, date_list):
            # Fill the item
            item = PaiItem()
            item['url'] = url
            item['title'] = title
            item['tphoto'] = tphoto
            item['up'] = up
            item['com'] = com
            item['date'] = date
            print(url, title, tphoto, up, com, date)  # debug output
            yield item

    def closed(self, reason):
        # Deduplication goes here: this hook runs once when the spider
        # finishes. For each duplicated url, keep the row with the
        # smallest id and delete the rest.
        dbObject = dbhandle()
        cursor = dbObject.cursor()
        cursor.execute("USE pai")
        dt = ("DELETE FROM paiapp WHERE url IN "
              "(SELECT * FROM (SELECT url FROM paiapp GROUP BY url HAVING count(url) > 1) a) "
              "AND id NOT IN "
              "(SELECT * FROM (SELECT min(id) AS id FROM paiapp GROUP BY url HAVING count(url) > 1) b)")
        cursor.execute(dt)
        cursor.connection.commit()
        print("Data deduplicated")
items.py
import scrapy


class PaiItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
    title = scrapy.Field()
    tphoto = scrapy.Field()
    up = scrapy.Field()
    com = scrapy.Field()
    date = scrapy.Field()
middlewares.py
import time

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


class PaiDownloaderMiddleware:
    def __init__(self):
        # Configure ChromeOptions
        chrome_options = Options()
        # Headless flag (enabling it needs extra configuration)
        # chrome_options.add_argument("--headless")
        # Create the driver object
        self.driver = webdriver.Chrome(options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        # Hook the spider_closed signal so the browser is quit when the crawl ends
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        # Open the page
        self.driver.get(request.url)
        time.sleep(1)
        # Click into the column to be crawled
        button = self.driver.find_element(By.XPATH, '//*[@id="app"]/div[1]/div[3]//div[1]/div/ul/li[3]')
        button.click()
        # Maximize so lazy-loaded images are not skipped (see notes at the end)
        self.driver.maximize_window()
        time.sleep(2)
        try:
            # Load the full list: click "load more" / scroll to the bottom
            self.reflash()
            # Hand the rendered page source back to Scrapy
            page_source = self.driver.page_source
            return HtmlResponse(url=request.url, body=page_source, request=request, encoding='utf-8')
        except Exception as e:
            spider.logger.error(f"Error rendering {request.url}: {e}")
            return HtmlResponse(url=request.url, status=500, request=request)

    def spider_closed(self, spider):
        self.driver.quit()

    def reflash(self):
        for i in range(40):  # number of load/scroll rounds; tune as needed
            try:
                # Click the "load more" button if it is present...
                a = self.driver.find_element(By.XPATH, '//*[@class="loadingMore"]')
                a.click()
            except NoSuchElementException:
                # ...otherwise scroll down one viewport
                self.driver.execute_script('window.scrollBy(0, window.innerHeight);')
            time.sleep(2)
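The fixed time.sleep calls are the fragile part of this middleware; Selenium's explicit waits are a more robust option. A minimal sketch (my substitution, not in the original), waiting for the "loadingMore" button inside reflash:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait up to 10 seconds for the button to become clickable instead of
# sleeping a fixed amount; raises TimeoutException if it never appears.
button = WebDriverWait(self.driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@class="loadingMore"]'))
)
button.click()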
pipelines.py
import pymysql


def dbhandle():
    # Open a MySQL connection; fill in your own credentials
    conn = pymysql.connect(
        host="localhost",
        user="user",
        passwd="*****",
        charset="utf8",
        use_unicode=False
    )
    return conn


class PaiPipeline:
    def process_item(self, item, spider):
        dbObject = dbhandle()
        cursor = dbObject.cursor()
        # Select the database
        cursor.execute("USE pai")
        # Insert one row per item
        sql = "INSERT INTO paiapp(url,title,tphoto,up,com,date) VALUES(%s,%s,%s,%s,%s,%s)"
        try:
            cursor.execute(sql, (item['url'], item['title'], item['tphoto'], item['up'], item['com'], item['date']))
            cursor.connection.commit()
        except Exception as e:
            print("Error here >>>>>>>>>>", e, "<<<<<<<<<<")
        return item
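For reference, a table definition that matches the pipeline's INSERT and the dedup query (which relies on an auto-increment id); the column types below are my assumptions:

# Hypothetical schema; `date` is backquoted since DATE is a MySQL keyword
cursor.execute("""
    CREATE TABLE IF NOT EXISTS paiapp (
        id INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255),
        title VARCHAR(255),
        tphoto VARCHAR(255),
        up VARCHAR(32),
        com VARCHAR(32),
        `date` VARCHAR(64)
    ) DEFAULT CHARSET = utf8
""")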
settings.py
BOT_NAME = "pai"
SPIDER_MODULES = ["pai.spiders"]
NEWSPIDER_MODULE = "pai.spiders"
LOG_LEVEL = 'WARNING'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
"pai.middlewares.PaiDownloaderMiddleware": 543,
}
ITEM_PIPELINES = {
"pai.pipelines.PaiPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
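With these settings every request is routed through PaiDownloaderMiddleware (priority 543) and every scraped item through PaiPipeline (priority 300); the crawl is then started with the standard Scrapy CLI, using the spider name defined above:

scrapy crawl paiapp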
requirements.txt
attrs==23.2.0
Automat==22.10.0
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2024.6.2
cffi==1.16.0
charset-normalizer==3.3.2
constantly==23.10.4
cryptography==42.0.8
cssselect==1.2.0
defusedxml==0.7.1
et-xmlfile==1.1.0
filelock==3.14.0
h11==0.14.0
hyperlink==21.0.0
idna==3.7
incremental==22.10.0
itemadapter==0.9.0
itemloaders==1.3.1
jmespath==1.0.1
lxml==5.2.2
numpy==1.26.4
openpyxl==3.1.3
outcome==1.3.0.post0
packaging==24.1
pandas==2.2.2
parsel==1.9.1
pip==24.0
Protego==0.3.1
pyasn1==0.6.0
pyasn1_modules==0.4.0
pycparser==2.22
PyDispatcher==2.0.7
PyMySQL==1.1.1
pyOpenSSL==24.1.0
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.1
queuelib==1.7.0
requests==2.32.3
requests-file==2.1.0
Scrapy==2.11.2
selenium==4.21.0
service-identity==24.1.0
setuptools==70.0.0
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
tldextract==5.1.2
trio==0.25.1
trio-websocket==0.11.1
Twisted==24.3.0
twisted-iocpsupport==1.0.4
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.1
w3lib==2.2.1
wsproto==1.2.0
zope.interface==6.4.post2
Problems encountered:
When simulating scroll-based paging in a small (non-maximized) window, some lazy-loaded images kept being skipped.
Cause:
Scroll coordinates in a small window differ from those in a full-screen window (some sites responsively rescale in a small window, so certain lazy-loaded images never enter the viewport and their loading is never triggered). I have not yet found a way to solve this in a small window.
Workaround: crawl with the webdriver window maximized.
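A minimal sketch of the workaround (the scroll-until-the-height-stops-growing check is my addition; the original reflash uses a fixed loop count):

import time

def scroll_to_bottom(driver):
    # Maximize first so lazy-load triggers fire at full-screen coordinates
    driver.maximize_window()
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll one viewport and give lazy-loaded images time to load
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # page height stopped growing: bottom reached
        last_height = new_height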