scrapy 存储MySQL数据库

需要安装:

pip install pymysql

代码:

import scrapy
import pymysql  # 导入 pymysql
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup

class SavearrSpider(scrapy.Spider):
    """Spider that renders a page with Selenium and stores list titles in MySQL.

    For each response, the URL is re-opened in a Selenium-driven Chrome
    instance, the rendered HTML is parsed with BeautifulSoup, and the text of
    the first ``<p>`` of every ``<li>`` inside ``div.gdposter_box > ul`` is
    inserted into the ``companies`` table (column ``title``).

    The Chrome driver and the MySQL connection are created in ``__init__``
    and released in ``close``.
    """

    name = "savearr"
    allowed_domains = ["***.com"]
    start_urls = ["https://www.***.com"]

    def __init__(self, *args, **kwargs):
        """Start the Chrome driver and open the MySQL connection.

        Raises:
            pymysql.MySQLError: if the database connection cannot be opened.
                The already-started browser is shut down before re-raising.
        """
        super(SavearrSpider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--start-maximized")  # start maximized
        chrome_options.add_argument("--disable-extensions")  # disable extensions
        chrome_options.add_argument("--disable-gpu")  # disable GPU acceleration
        # Hard-coded local chromedriver location; adjust for your machine.
        driver_path = r'D:\software\chromedriver\chromedriver127\chromedriver-win64\chromedriver.exe'
        service = Service(executable_path=driver_path)
        self.driver = webdriver.Chrome(service=service, options=chrome_options)

        # Open the database connection. If this fails, do not leave an
        # orphaned Chrome process behind.
        try:
            self.db_connection = pymysql.connect(
                host='localhost',
                user='root',  # replace with your MySQL user name
                password='root',  # replace with your MySQL password
                database='test',  # database name
                charset='utf8mb4',
                cursorclass=pymysql.cursors.DictCursor
            )
        except pymysql.MySQLError:
            self.driver.quit()
            raise

    def parse(self, response):
        """Render ``response.url`` in Chrome and persist the extracted titles.

        The browser is intentionally NOT closed here: it is shared by every
        response this spider handles and is shut down once in ``close``.
        """
        # Load the page through Selenium so that the rendered DOM is scraped.
        self.driver.get(response.url)
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        gdposter_box = soup.find('div', class_='gdposter_box')
        if not gdposter_box:
            self.logger.warning("未找到 元素。")
            return

        table = gdposter_box.find('ul')
        if not table:
            self.logger.warning("未找到指定的  元素。")
            return

        company_data = []
        for company in table.find_all('li'):
            paragraph = company.find('p')
            if paragraph is None:
                # An <li> without a <p> has no title; skip it instead of
                # failing the whole page with an AttributeError.
                continue
            company_data.append({'title': paragraph.get_text()})

        # Save the collected rows to MySQL.
        self.save_to_mysql(company_data)

    def save_to_mysql(self, company_data):
        """Insert the ``title`` of every entry into the ``companies`` table.

        Args:
            company_data: list of dicts, each with a ``'title'`` key.

        Raises:
            pymysql.MySQLError: if the insert fails; the transaction is
                rolled back before the error is re-raised.
        """
        if not company_data:
            return

        sql = "INSERT INTO companies (title) VALUES (%s)"
        rows = [(company['title'],) for company in company_data]
        try:
            with self.db_connection.cursor() as cursor:
                cursor.executemany(sql, rows)
            self.db_connection.commit()
        except pymysql.MySQLError:
            # Do not leave a half-applied batch pending on the connection.
            self.db_connection.rollback()
            raise

    def close(self, reason):
        """Release the browser and the database connection.

        NOTE(review): Scrapy's documented per-spider shutdown hook is
        ``closed(reason)``; this override of ``close`` is kept to preserve
        the existing interface - confirm it is invoked on your Scrapy version.
        """
        try:
            self.driver.quit()
        finally:
            # Always close the DB connection, even if quitting Chrome fails.
            self.db_connection.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值