Scrape my own CSDN blog post list (https://blog.csdn.net/lixinkuan328/article/list/1) and store the data in MySQL.
- Use the Scrapy commands to generate the project skeleton and the spider class:
scrapy startproject csdn
cd csdn
scrapy genspider csdnspider blog.csdn.net
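These commands produce the standard Scrapy skeleton; the files edited in the steps below live in this tree (middlewares.py is generated but left untouched here):

csdn/
    scrapy.cfg
    csdn/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            csdnspider.py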
Coding steps:
1. items.py
# -*- coding: utf-8 -*-
import scrapy

class CsdnItem(scrapy.Item):
    # article title
    name = scrapy.Field()
    # article URL
    csdnUrl = scrapy.Field()
    # content summary
    content = scrapy.Field()
    # publish time
    creatTime = scrapy.Field()
    # read count
    readNum = scrapy.Field()
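A note on the fields: a CsdnItem behaves like a dict inside the spider, but only the fields declared above can be assigned; setting an undeclared key raises a KeyError, which catches field-name typos early.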
2. spiders/csdnspider.py
# -*- coding: utf-8 -*-
import scrapy
from csdn.items import CsdnItem

class CsdnspiderSpider(scrapy.Spider):
    name = 'csdnspider'
    allowed_domains = ['blog.csdn.net']
    offset = 1
    url = "https://blog.csdn.net/lixinkuan328/article/list/"
    start_urls = (
        url + str(offset),
    )

    def parse(self, response):
        articles = response.xpath("//div[@class='article-item-box csdn-tracking-statistics']")
        for each in articles:
            # create a fresh item per article; reusing one item across
            # iterations would let later assignments clobber earlier yields
            item = CsdnItem()
            # index 2 skips the text nodes of the '原创' badge inside h4/a
            # (per CSDN's listing markup at the time)
            item['name'] = each.xpath("./h4/a//text()").extract()[2].strip()
            item['csdnUrl'] = each.xpath("./h4/a/@href").extract()[0].strip()
            item['content'] = each.xpath("./p[@class='content']/a/text()").extract()[0].strip()
            item['creatTime'] = each.xpath("./div//span[@class='date']/text()").extract()[0].strip()
            item['readNum'] = each.xpath("./div//span[@class='num']/text()").extract()[0].strip()
            yield item
        # follow the next listing page, up to page 13
        if self.offset < 13:
            self.offset += 1
            yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
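Before trusting these XPaths, it is worth probing them interactively. `scrapy shell` fetches a page and opens a console with `response` ready to query (CSDN may reject the default User-Agent, so treat this as a sketch):

scrapy shell "https://blog.csdn.net/lixinkuan328/article/list/1"
>>> articles = response.xpath("//div[@class='article-item-box csdn-tracking-statistics']")
>>> len(articles)
>>> articles[0].xpath("./h4/a//text()").extract()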
3. pipelines.py
# -*- coding: utf-8 -*-
import pymysql

class CsdnPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='192.168.18.102',
            port=3306,
            db='lxk',
            user='root',
            password='123',
            charset='utf8',
            use_unicode=True
        )
        # cursor used to execute the SQL statements
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        # plain pymysql usage; the connection's charset already handles
        # encoding, so the values are passed as str directly.
        # Assumes a table along the lines of:
        #   CREATE TABLE csdn (name VARCHAR(255), csdnurl VARCHAR(255),
        #                      content TEXT, creattime VARCHAR(32), readnum VARCHAR(16));
        self.cursor.execute(
            """insert into csdn(name, csdnurl, content, creattime, readnum) values (%s, %s, %s, %s, %s)""",
            (item['name'], item['csdnUrl'], item['content'], item['creatTime'], item['readNum']))
        # commit the transaction
        self.connect.commit()
        # pipelines must return the item for downstream components
        return item

    def close_spider(self, spider):
        # tidy up the connection when the spider finishes
        self.cursor.close()
        self.connect.close()
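If the target store is MongoDB rather than MySQL, a minimal pipeline sketch using pymongo looks like this (the host, port, and the lxk/csdn database and collection names are assumptions mirroring the MySQL settings above):

# -*- coding: utf-8 -*-
import pymongo

class CsdnMongoPipeline(object):
    def __init__(self):
        # connect to MongoDB; host and port are assumptions
        self.client = pymongo.MongoClient('192.168.18.102', 27017)
        self.collection = self.client['lxk']['csdn']

    def process_item(self, item, spider):
        # a scrapy.Item converts cleanly to a plain dict for insertion
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()

To use it, register csdn.pipelines.CsdnMongoPipeline in ITEM_PIPELINES instead of (or alongside) CsdnPipeline.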
4. settings.py
ITEM_PIPELINES = {
    'csdn.pipelines.CsdnPipeline': 300,
}
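Depending on CSDN's current behavior, two more settings are often needed: the site's robots.txt may block the crawl, and the default Scrapy User-Agent tends to be rejected. A sketch of the usual additions (values are assumptions, not verified against the live site):

ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
DOWNLOAD_DELAY = 0.5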
5. Launcher script start.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy import cmdline
cmdline.execute("scrapy crawl csdnspider".split())
The results can be verified as follows:
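A quick check from Python, reusing the connection parameters from step 3 (a sketch, assuming the csdn table described there):

# -*- coding: utf-8 -*-
import pymysql

connect = pymysql.connect(host='192.168.18.102', port=3306, db='lxk',
                          user='root', password='123', charset='utf8')
cursor = connect.cursor()
# count the stored rows and peek at a few of them
cursor.execute("select count(*) from csdn")
print("rows stored:", cursor.fetchone()[0])
cursor.execute("select name, readnum from csdn limit 3")
for row in cursor.fetchall():
    print(row)
connect.close()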