目录结构
articleSpider.py
from scrapy.selector import Selector
from scrapy import Spider
from firstscrapy.items import FirstscrapyItem
class ArticleSpider(Spider):
    """Crawl the article list of a CSDN blog and yield one item per entry batch.

    NOTE(review): the hard-coded XPaths target CSDN's old '#mainBox' layout —
    confirm they still match the live page structure before relying on them.
    """

    name = "csdn"
    # allowed_domains takes bare domain names, not URLs — with the
    # "https://" prefix the offsite middleware filters every request.
    allowed_domains = ["blog.csdn.net"]
    start_urls = ["https://blog.csdn.net/wei_zhen_dong"]

    def parse(self, response):
        # XPaths copied via the browser dev-tools "Copy XPath" feature.
        papers = response.xpath("//*[@id='mainBox']/main/div[2]/div[1]")
        for pap in papers:
            # Queries inside the loop must start with './/' to be relative
            # to the current node; a bare '//' re-scans the whole document
            # on every iteration.
            title = pap.xpath(".//h4/a/text()").re('.*')
            url = pap.xpath(".//h4/a/@href").extract()
            read_num = pap.xpath(".//div[1]/p[3]/span/span/text()").extract()
            post_time = pap.xpath(".//div[1]/p[1]/span/text()").extract()

            # Light cleanup: drop whitespace-only title fragments.
            cleaned_titles = [t for t in title if t.replace(" ", "") != ""]
            # The first two time entries are layout noise — skip them.
            cleaned_times = post_time[2:]

            yield FirstscrapyItem(
                title=cleaned_titles,
                url=url,
                read_num=read_num,
                time=cleaned_times,
            )
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FirstscrapyItem(scrapy.Item):
    """Container for one batch of scraped CSDN article-list data."""

    # Each field holds the list extracted by ArticleSpider.parse.
    title = scrapy.Field()      # cleaned article titles
    url = scrapy.Field()        # article links
    read_num = scrapy.Field()   # read counts
    time = scrapy.Field()       # post timestamps
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from opdata.opexcel import Operatingexcel
class FirstscrapyPipeline(object):
    """Persist scraped items into an Excel workbook via Operatingexcel."""

    def __init__(self):
        self.op = Operatingexcel()

    def process_item(self, item, spider):
        """Write the item to data\\csdn_data.xlsx, or drop it if empty.

        Scrapy's pipeline contract: return the item on success, raise
        DropItem to discard it — returning a plain string (as the original
        code did) would silently feed garbage to downstream pipelines.
        """
        print(item)
        if item['title']:
            # NOTE(review): prefer a raw string or os.path.join here —
            # '\c' happens to be a literal backslash today, but the
            # Windows-style path is fragile.
            self.op.set_excel_dic(item, "data\csdn_data.xlsx", 0, 0)
            print(item['title'])
            return item
        raise DropItem("数据丢失")
settings.py
加入以下代码,用以启动pipelines
# Register the Excel-writing pipeline; 300 is its order value
# (lower numbers run earlier in the pipeline chain).
ITEM_PIPELINES = {
'firstscrapy.pipelines.FirstscrapyPipeline': 300,
}
opexcel.py
自己写的操作excel的类,只有两个功能,有时间应该完善一下,我用的挺方便的,哈哈哈…
import xlrd
import xlwt
from xlutils.copy import copy
class Operatingexcel():
    """Minimal Excel helper built on xlrd/xlwt/xlutils.

    Reading uses xlrd; writing copies the existing workbook with
    xlutils.copy so previously written content is preserved.
    """

    def get_excel_dic(self, filename, sheetname):
        """Read a sheet into ``{header: [non-empty column values]}``.

        filename  -- workbook path
        sheetname -- sheet (tab) name
        Row 0 is treated as the header row.
        """
        # 'filename' only: the original passed 'r' as a second positional
        # argument, but that slot is xlrd's *logfile* parameter — there is
        # no file-mode argument to open_workbook.
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        headers = table.row_values(0)
        dic = {}
        for i in range(1, table.nrows):
            row = table.row_values(i)
            for y in range(len(headers)):
                if row[y] != "":
                    dic.setdefault(headers[y], []).append(row[y])
        return dic

    def get_excel_list(self, filename, sheetname):
        """Read every non-empty cell of a sheet into a flat row-major list."""
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        width = len(table.row_values(0))
        values = []
        for y in range(table.nrows):
            row = table.row_values(y)
            for x in range(width):
                if row[x] != "":
                    values.append(row[x])
        return values

    def set_excel_dic(self, dic, filename, sheet_index, start_r):
        """Write a dict column by column: each key as a header, values below.

        sheet_index -- worksheet index inside the workbook
        start_r     -- first column index to write into
        """
        x = start_r
        for k in dic.keys():
            column = [k] + [v for v in dic[k]]
            self.set_excel_list(column, filename, sheet_index, x)
            x = x + 1

    def set_excel_list(self, list, filename, sheet_index, start_r):
        """Write a list into column ``start_r`` of worksheet ``sheet_index``.

        The workbook is read with xlrd, copied to a writable xlwt object so
        the existing sheets survive, updated, and saved back to ``filename``.
        """
        r_xls = xlrd.open_workbook(filename)
        excel = copy(r_xls)
        table = excel.get_sheet(sheet_index)
        for y in range(len(list)):
            cell = str(list[y])
            # Bug fix: the original used .split(), which returns a LIST —
            # the != "" comparison was therefore always true and the list
            # itself (not the string) was written into the cell. The intent
            # was clearly .strip(): skip blank cells, write trimmed text.
            if cell.strip() != "":
                table.write(y, start_r, cell.strip())
        excel.save(filename)
运行结果
评论和点赞的数据几乎没有,这里就不进行获取了…