利用Scrapy框架爬取csdn博客数据,并保存到excel

目录结构
目录
articleSpider.py

from scrapy.selector import Selector
from scrapy import Spider
from firstscrapy.items import FirstscrapyItem
class ArticleSpider(Spider):
    """Spider that scrapes the article list of one CSDN blog.

    One ``FirstscrapyItem`` is yielded per matched list container; each of
    its fields holds a list of values (titles, URLs, read counts, times).
    """

    name = "csdn"
    # allowed_domains must contain bare domain names, not URLs with a scheme.
    allowed_domains = ["blog.csdn.net"]
    start_urls = ["https://blog.csdn.net/wei_zhen_dong"]

    def parse(self, response):
        """Extract article metadata from the blog's landing page.

        Args:
            response: the Scrapy response for one of ``start_urls``.

        Yields:
            FirstscrapyItem with ``title``, ``url``, ``read_num`` and
            ``time`` set to lists of extracted strings.
        """
        # XPath copied from Chrome DevTools for the article list container.
        papers = response.xpath("//*[@id='mainBox']/main/div[2]/div[1]")
        for pap in papers:
            # NOTE(review): these expressions start with "//", so they search
            # the whole document rather than only inside `pap`. They are kept
            # unchanged because the clean-up below is tuned to their output;
            # switch to ".//" only after re-checking it against the page.
            title = pap.xpath("//h4/a/text()").re('.*')
            url = pap.xpath("//h4/a/@href").extract()
            read_num = pap.xpath("//div[1]/p[3]/span/span/text()").extract()
            times = pap.xpath("//div[1]/p[1]/span/text()").extract()

            # Light clean-up: drop title fragments that are empty or
            # consist only of spaces.
            clean_titles = [t for t in title if t.replace(" ", "") != ""]
            # The first two matches are discarded, as in the original code;
            # presumably they are not article timestamps -- TODO confirm.
            clean_times = times[2:]

            yield FirstscrapyItem(
                title=clean_titles,
                url=url,
                read_num=read_num,
                time=clean_times,
            )

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class FirstscrapyItem(scrapy.Item):
    """Item carrying the data scraped from a blog's article list."""

    # Each field is filled by the spider with a list of extracted strings.
    title = scrapy.Field()     # article titles
    url = scrapy.Field()       # article links
    read_num = scrapy.Field()  # read counters
    time = scrapy.Field()      # publication times

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem
from opdata.opexcel import Operatingexcel
class FirstscrapyPipeline(object):
    """Item pipeline that writes scraped items into an Excel workbook."""

    def __init__(self):
        # Helper that performs the actual workbook reads and writes.
        self.op = Operatingexcel()

    def process_item(self, item, spider):
        """Write *item* to the workbook, or drop it when it has no title.

        Args:
            item: the scraped item; must contain a ``title`` key.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged item, so later pipeline stages can process it.

        Raises:
            DropItem: if ``item['title']`` is empty. Raising (instead of
                returning a message string) stops further processing of the
                item; a returned string would be passed on as if it were one.
        """
        print(item)
        if not item['title']:
            raise DropItem("数据丢失")
        # Raw string: "\c" is not a valid escape sequence, so the plain
        # literal triggers an invalid-escape warning. The value is unchanged.
        self.op.set_excel_dic(item, r"data\csdn_data.xlsx", 0, 0)
        print(item['title'])
        return item

settings.py
在 settings.py 中加入以下配置,用以启用 pipelines:

# Enable the project's item pipeline. The integer is the pipeline's order
# value: per the Scrapy documentation, pipelines with lower values run first
# and values are customarily chosen from the 0-1000 range.
ITEM_PIPELINES = {
   'firstscrapy.pipelines.FirstscrapyPipeline': 300,
}

opexcel.py
自己写的操作 Excel 的类,目前只有读取和写入两类功能,有时间应该再完善一下。我自己用着挺方便的,哈哈哈…

import xlrd
import xlwt
from xlutils.copy import copy
class Operatingexcel():
    """Small helper for reading and writing workbook sheets.

    Reading uses ``xlrd``; writing copies the existing workbook with
    ``xlutils.copy.copy`` and saves it back to the same file name.
    """

    def get_excel_dic(self, filename, sheetname):
        """Read a sheet into a dict keyed by the cells of its first row.

        Args:
            filename: path of the workbook file.
            sheetname: name of the sheet to read.

        Returns:
            dict mapping each first-row cell value to the list of non-empty
            cell values found below it in the same column. Columns with no
            non-empty values are omitted.
        """
        dic = {}
        # The second positional parameter of open_workbook is `logfile`, not
        # a file mode, so no 'r' argument is passed.
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        if table.nrows == 0:
            # No header row to read; matches the original's empty result.
            return dic
        header = table.row_values(0)
        for row_index in range(1, table.nrows):
            row = table.row_values(row_index)
            for key, value in zip(header, row):
                if value != "":
                    dic.setdefault(key, []).append(value)
        return dic

    def get_excel_list(self, filename, sheetname):
        """Read all non-empty cells of a sheet into a flat list.

        Args:
            filename: path of the workbook file.
            sheetname: name of the sheet to read.

        Returns:
            list of non-empty cell values in row-major order.
        """
        values = []
        # See get_excel_dic: 'r' would be taken as the `logfile` argument.
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        for row_index in range(table.nrows):
            values.extend(v for v in table.row_values(row_index) if v != "")
        return values

    def set_excel_dic(self, dic, filename, sheet_index, start_r):
        """Write a mapping to a sheet, one column per key.

        Each column starts with the key, followed by the values stored
        under it.

        Args:
            dic: mapping from a column heading to an iterable of values.
            filename: path of an existing workbook file.
            sheet_index: index of the sheet to write to.
            start_r: index of the column that receives the first key;
                each further key goes one column to the right.
        """
        for offset, key in enumerate(dic.keys()):
            column = [key]
            column.extend(dic[key])
            self.set_excel_list(column, filename, sheet_index, start_r + offset)

    def set_excel_list(self, list, filename, sheet_index, start_r):
        """Write a sequence of values down one column of a sheet.

        Args:
            list: values to write; element ``i`` goes to row ``i``. (The
                name shadows the builtin but is kept for compatibility with
                existing keyword callers.)
            filename: path of an existing workbook file; it is read,
                modified and saved back under the same name.
            sheet_index: index of the sheet to write to.
            start_r: index of the column to write into.
        """
        # Read the existing workbook.
        r_xls = xlrd.open_workbook(filename)
        # Convert the read-only xlrd workbook into a writable copy.
        excel = copy(r_xls)
        table = excel.get_sheet(sheet_index)
        for row_index, value in enumerate(list):
            # strip() yields the trimmed text. The original used split(),
            # which returns a list: the != "" check was therefore always
            # true and a list, not a string, was passed to write().
            text = str(value).strip()
            if text:
                table.write(row_index, start_r, text)
        excel.save(filename)

运行结果
评论和点赞的数据几乎没有,这里就不进行获取了…
运行结果

  • 2
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值