Goal: scrape the articles, links, and related metadata from my own CSDN blog.
First, create the Scrapy project.
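The commands below are a minimal sketch of that step; the project name myweb is inferred from the from myweb.items import ... line in the spider further down:

scrapy startproject myweb
cd myweb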
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MywebItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Link = scrapy.Field()
    read = scrapy.Field()
    comment = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
    articletype = scrapy.Field()
Next, create the spider template file myspider.py.
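Scrapy can generate the skeleton for you; the spider name and domain below match the name and allowed_domains fields in the finished spider:

scrapy genspider myspider blog.csdn.net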
myspider.py
# -*- coding: utf-8 -*-
import scrapy
from myweb.items import MywebItem
from scrapy.http import Request

# Browser-like User-Agent, shared by every request so CSDN does not reject us.
UA = ("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")

class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/weixin_43614688/article/list/1?']

    def start_requests(self):
        yield Request("https://blog.csdn.net/weixin_43614688/article/list/1?",
                      headers={"User-Agent": UA})

    def parse(self, response):
        item = MywebItem()
        # Each article sits in a div.article-item-box; the title <a> also
        # contains a <span> with the article type, so a/text() returns two
        # text nodes per article. The pipeline compensates with index 2*j+1.
        item['title'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/text()').extract()
        item['articletype'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/span/text()').extract()
        item['date'] = response.xpath('//div[@class="info-box d-flex align-content-center"]//span[@class="date"]/text()').extract()
        item['Link'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/@href').extract()
        yield item
        # Queue page 2 of the article list; Scrapy's duplicate filter drops
        # this request when page 2 itself is being parsed.
        yield Request("https://blog.csdn.net/weixin_43614688/article/list/2?",
                      callback=self.parse,
                      headers={"User-Agent": UA})
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class MywebPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', user='root',
                                    passwd='lkm', db='bank')
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        for j in range(len(item['Link'])):
            # a/text() yields two text nodes per article (whitespace before
            # the type <span>, then the real title), hence the 2*j+1 index;
            # strip() removes the surrounding whitespace.
            title = item['title'][2*j+1].strip()
            articletype = item['articletype'][j]
            date = item['date'][j]
            link = item['Link'][j]
            # Parameterized query: string concatenation breaks (and invites
            # SQL injection) as soon as a title contains a quote.
            sql = ("insert into myblog(title, articletype, date, link) "
                   "values(%s, %s, %s, %s)")
            self.cursor.execute(sql, (title, articletype, date, link))
        return item

    def close_spider(self, spider):
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
Configure the pipeline accordingly in settings.py:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'myweb.pipelines.MywebPipeline': 300,
}
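CSDN serves a robots.txt that disallows most crawlers, so the listing pages may be dropped with a "Forbidden by robots.txt" log line. In that case (this is an assumption; confirm against your own Scrapy log) also add to settings.py:

# Assumption: only needed if the log shows requests blocked by robots.txt.
ROBOTSTXT_OBEY = False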
Once the data table has been created in MySQL, the program is ready to run.
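A sketch of the table definition, assuming the bank database used by the pipeline above; title varchar(30) matches the debugging story below, while the other column lengths are guesses:

create table myblog(
    title varchar(30),
    articletype varchar(20),
    date varchar(20),
    link varchar(100)
);

Then launch the spider from the project directory:

scrapy crawl myspider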
Debugging:
While debugging you may hit a fairly cryptic error message, ERROR: Error processing; just follow the hints in the traceback to locate the failing spot and inspect it.
Here the cause was that the title column was created as varchar(30), and some titles exceed that length, so the run aborted partway through. Widen the column definition:
alter table myblog modify title varchar(100);
With the error resolved, the program runs normally; check the results with:
select * from myblog;