Scraping CSDN blog posts with Scrapy + Selenium, cleaning the data, and storing it in MySQL

Straight to the code.

  1. Spider code:
import re

import lxml.html
import scrapy
from scrapy import Request

from CsDn.items import CsdnItem

number = 1  # current search-result page, passed to the Selenium middleware via meta


class ExampleSpider(scrapy.Spider):

    name = 'csdn'
    max_page = 10  # stop after this many pages so the spider terminates

    def start_requests(self):
        url_str = 'https://so.csdn.net/so/search/s.do?q=python&t=blog&o=&s=&l='
        # meta['page'] == '1' tells the middleware to simply load the URL
        yield Request(url=url_str, callback=self.parse, dont_filter=True, meta={'page': '1'})

    def parse(self, response):
        contents = response.xpath('//dl[@class="search-list J_search"]').extract()
        for i in contents:
            cont = lxml.html.fromstring(i)
            # skip results that have no author link (ads and other non-blog entries)
            if cont.xpath('.//dd[@class="author-time"]/a//text()'):
                url = cont.xpath('.//dd[@class="search-link"]/a/@href')
                author = cont.xpath('.//dd[@class="author-time"]/a//text()')
                title = ''.join(cont.xpath('.//dt/a[1]//text()'))
                post_info = cont.xpath('.//dd[@class="author-time"]/text()')
                comments = 0
                post_time = self.get_post_time(post_info)
                count_views = self.get_count_views(post_info)
                item = CsdnItem()
                item['url'] = ''.join(url)
                item['author'] = ''.join(author)
                item['post_time'] = post_time
                item['count_views'] = count_views
                item['comments'] = comments
                item['title'] = title
                yield item
        global number
        number += 1
        if number <= self.max_page:
            # meta['page'] == '2' tells the middleware to jump to page meta['number']
            yield Request(url=response.url, callback=self.parse, dont_filter=True,
                          meta={'page': '2', 'number': number})

    def get_count_views(self, post_info_list):
        # post_info contains text like "… 日期:<time> … 浏览<n>次"
        post_info = ''.join(post_info_list)
        count_views = re.search('浏览(.+)次', post_info).group(1)
        count_view = re.search('[0-9]+', count_views).group()
        return int(count_view)

    def get_post_time(self, post_info_list):
        post_info = ''.join(post_info_list)
        post_time = re.search('日期:(.+) ', post_info).group(1)
        return post_time
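For the middleware and pipeline below to actually run, they have to be enabled in the project's settings.py. A minimal sketch, assuming the project is named CsDn and the classes live in CsDn.middlewares and CsDn.pipelines (adjust the module paths and priorities to your own layout):

```python
# settings.py (sketch; module paths are assumptions based on the import "from CsDn.items import CsdnItem")
ROBOTSTXT_OBEY = False  # commonly disabled when every request is rendered through Selenium

DOWNLOADER_MIDDLEWARES = {
    'CsDn.middlewares.SeleniumMiddlewares': 543,
}

ITEM_PIPELINES = {
    'CsDn.pipelines.CsdnPipeline': 300,
}
```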
  2. Downloader middleware:
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions


class SeleniumMiddlewares(object):

    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")  # uncomment to run Firefox headless
        # Selenium 3 style API; point executable_path at your local geckodriver
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        page = int(request.meta.get('page', 1))  # .get() avoids a KeyError on requests without this meta
        if page == 2:
            self.browser.get(request.url)
            time.sleep(3)
            # scroll to the bottom so the pagination widget is loaded
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            # type the target page number into the "jump to page" box and submit
            input_box = self.browser.find_element_by_xpath('//span/input')
            input_box.clear()
            input_box.send_keys(request.meta['number'])
            btn = self.browser.find_element_by_xpath('//span[@class="page-go"]/button')
            btn.click()
            time.sleep(3)
        elif page == 1:
            self.browser.get(request.url)
            time.sleep(2)
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
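The middleware above never quits Firefox, so the browser window stays open after the crawl finishes. One way to close it is to hook Scrapy's spider_closed signal; a minimal sketch (the from_crawler/spider_closed methods here are additions to the class above, not part of the original code):

```python
from scrapy import signals

class SeleniumMiddlewares(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # quit the browser when the spider closes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.browser.quit()
```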
  3. Pipeline:
import pymysql


class CsdnPipeline(object):

    def open_spider(self, spider):
        # connect to MySQL; replace the credentials with your own
        host = '47.75.81.75'   # database host address
        port = 3306            # the port must be an int, not a string
        user = 'user_name'
        password = 'password'
        dbname = 'database_name'
        dbcharset = 'utf8'
        self.conn = pymysql.Connect(host=host, port=port, user=user, password=password,
                                    db=dbname, charset=dbcharset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # insert into the blogs_csdn table; a parameterized query avoids quoting and injection problems
        sql = ('insert into blogs_csdn(url, author, post_time, count_views, comments, title) '
               'values (%s, %s, %s, %s, %s, %s)')
        params = (item['url'], item['author'], item['post_time'],
                  item['count_views'], item['comments'], item['title'])
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
            print('#' * 10 + ' saved successfully')
        except Exception as e:
            print('*' * 10 + ' insert failed')
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
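The INSERT above assumes a blogs_csdn table already exists. A possible schema created through pymysql (the column types and lengths are my guesses from the fields the spider produces; adjust as needed):

```python
import pymysql

# same connection parameters as the pipeline (placeholders, replace with your own)
conn = pymysql.Connect(host='47.75.81.75', port=3306, user='user_name',
                       password='password', db='database_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS blogs_csdn (
            id INT AUTO_INCREMENT PRIMARY KEY,
            url VARCHAR(255),
            author VARCHAR(100),
            post_time VARCHAR(50),
            count_views INT,
            comments INT,
            title VARCHAR(255)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```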

  4. Items:
import scrapy

class CsdnItem(scrapy.Item):
    url = scrapy.Field()
    author = scrapy.Field()
    post_time = scrapy.Field()
    count_views = scrapy.Field()
    comments = scrapy.Field()
    title = scrapy.Field()
  5. Stored result, as shown in the screenshot below:
    (MySQL table screenshot)
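To check the stored rows without a GUI client, a quick query sketch (same placeholder connection parameters as the pipeline):

```python
import pymysql

conn = pymysql.Connect(host='47.75.81.75', port=3306, user='user_name',
                       password='password', db='database_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('SELECT title, author, count_views FROM blogs_csdn ORDER BY count_views DESC LIMIT 5')
    for row in cursor.fetchall():
        print(row)
conn.close()
```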