1、在cmd窗口,创建一个爬虫的项目
scrapy startproject bnuzpjt
2、编写items.py文件
import scrapy


class BnuzpjtItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Title of the announcement ("公告通知") — full <a> element text scraped
    # from the homepage.
    title = scrapy.Field()
    # Relative href of the announcement page (e.g. "info/1035/3295.htm").
    url = scrapy.Field()
3、编写pipelines.py文件
①存入txt
import codecs
import json


class BnuzpjtPipeline(object):
    """Item pipeline that appends each scraped item to a text file,
    one JSON object per line.
    """

    def __init__(self):
        # NOTE(review): hard-coded absolute path — consider making this
        # configurable via settings. codecs.open handles the encoding, so
        # plain "w" mode is used (the original "wb" + encoding combination
        # only worked because codecs.open forces binary internally).
        self.file = codecs.open(r"C:\Users\j\bnuzpjt\1.txt", "w",
                                encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and write it to the file.

        ensure_ascii=False keeps the Chinese text human-readable instead
        of \\u-escaped. Returns the item unchanged so later pipelines
        (if any) still receive it.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # BUG FIX: the original had this method commented out while leaving
        # its body dangling, so the file handle was never closed and
        # buffered output could be lost. Close the file when the spider ends.
        self.file.close()
②存入数据库
import os
import pymysql
from scrapy.exceptions import DropItem
import sys
import importlib
importlib.reload(sys)
# Running row id for the url_id column, and the (fixed) crawl date.
INDEX = 1
DATE = "2017-11-16"


class BnuzpjtPipeline(object):
    """Item pipeline that stores each item's announcement URLs in the
    MySQL table `bnuz` (columns: url_id, url, adate).
    """

    def __init__(self):
        try:
            self.db = pymysql.connect(host="127.0.0.1", user="root",
                                      passwd="123456", port=3306,
                                      db="pythonsql", charset="utf8")
            self.cursor = self.db.cursor()
            print("Connect to db successfully!")
        except pymysql.MySQLError:
            # Was a bare `except:` which hid every error (including typos
            # and KeyboardInterrupt); catch only database errors.
            print("Fail to connect to db!")

    def process_item(self, item, spider):
        """Insert one row per relative href in item['url'].

        Each href is prefixed with the site base URL. Items with an empty
        url list are dropped. Returns the item for downstream pipelines.
        """
        global INDEX
        if not item["url"]:
            raise DropItem(item)
        sql = "insert into bnuz (url_id,url,adate) values(%s,%s,%s)"
        for href in item["url"]:
            # BUG FIX: INDEX is now incremented per URL, not per item, so
            # every stored row gets a distinct url_id (the original gave all
            # URLs of one item the same id). The redundant manual counter
            # `u` alongside the for-loop variable is gone.
            full_url = "http://www.bnuz.edu.cn/" + href
            self.cursor.execute(sql, (INDEX, full_url, DATE))
            INDEX += 1
        return item

    def close_spider(self, spider):
        self.db.commit()
        # BUG FIX: original wrote `self.db.close` without parentheses, a
        # no-op attribute access — the connection was never closed.
        self.db.close()
        print("Done")
4、编写settings.py文件
# Scrapy project settings for the bnuzpjt crawler.
BOT_NAME = 'bnuzpjt'

SPIDER_MODULES = ['bnuzpjt.spiders']
NEWSPIDER_MODULE = 'bnuzpjt.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 300 is the pipeline's priority (0-1000, lower runs first).
    'bnuzpjt.pipelines.BnuzpjtPipeline': 300,
}
5、在爬虫项目对应的目录下建一个爬虫文件
具体操作:
① 在爬虫项目的根目录:shift+鼠标右击->在此处打开powershell窗口
scrapy genspider -t basic bnuzspd bnuz.edu.cn
② 修改spiders文件夹下的bnuzspd.py文件
# -*- coding: utf-8 -*-
import scrapy
from bnuzpjt.items import BnuzpjtItem
from scrapy.http import Request
class BnuzspdSpider(scrapy.Spider):
    """Spider that scrapes announcement titles and links from the
    bnuz.edu.cn homepage into BnuzpjtItem objects.
    """

    name = 'bnuzspd'
    # Domains the spider is allowed to crawl.
    allowed_domains = ['bnuz.edu.cn']
    # NOTE: double-check this URL — it must match the site being crawled,
    # or the spider silently yields nothing.
    start_urls = ['http://www.bnuz.edu.cn/']

    def parse(self, response):
        item = BnuzpjtItem()
        # Full <a> elements whose title attribute contains the
        # announcement marker 【公告通知】.
        item["title"] = response.xpath("//a[contains(@title,'【公告通知】')]").extract()
        # Relative hrefs from the left-hand list (ul.leftclick).
        item["url"] = response.xpath("//ul[@class='leftclick']/li/a/@href").extract()
        yield item
6、调试与运行
scrapy crawl bnuzspd
若代码准确无误,则可直接运行,但是笔者在一开始编写代码的时候,出现了几个错误,可以给你们提供调试的思路。
①start_urls 要正确;
②一开始运行时,笔者输入的运行代码是:
scrapy crawl bnuzspd --nolog
加上了--nolog是不会显示报错信息的,就不方便调试了;
③注意文件路径在代码中表示时,斜线的方向;
④还有一个就是xpath的调试了,scrapy自带有scrapy shell可以帮助xpath的调试。
7、scrapy shell
① 在爬虫项目的根目录:shift+鼠标右击->在此处打开powershell窗口
scrapy shell http://www.bnuz.edu.cn
②输入xpath,若xpath正确,则系统会爬取对应的信息
In [1]: sel.xpath("//a[contains(@title,'【公告通知】')]").extract()
2017-11-17 09:58:07 [py.warnings] WARNING: shell:1: ScrapyDeprecationWarning: "sel" shortcut is deprecated. Use "response.xpath()", "response.css()" or "response.selector" instead
Out[1]:
['<a title="【公告通知】广东省2018届高校毕业生系列供需见面活动(珠海地区专场)邀请函" href="info/1035/3295.htm" target="_blank"><span class="lefttitle">【公告通知】广东省2018届高校毕业生系列供需见...</span></a>',
'<a title="【公告通知】关于开展“我的跨文化故事”主题征文活动的通知" href="info/1035/3348.htm" target="_blank"><span class="lefttitle">【公告通知】关于开展“我的跨文化故事”主题征...</span></a>',
'<a title="【公告通知】关于校内外勤工助学信息发布渠道的公告" href="info/1035/3347.htm" target="_blank"><span class="lefttitle">【公告通知】关于校内外勤工助学信息发布渠道的公告</span></a>',
'<a title="【公告通知】学校高压电缆抢修停电通知" href="info/1035/3338.htm" target="_blank"><span class="lefttitle">【公告通知】学校高压电缆抢修停电通知</span></a>',
'<a title="【公告通知】关于为我校老年教职员工(含家属)及“慢性病”患者免费开展健康体检活动的通知" href="info/1035/3339.htm" target="_blank"><span class="lefttitle">【公告通知】关于为我校老年教职员工(含家属)...</span></a>',
'<a title="【公告通知】关于开展“青春共话十九大”微党课大赛的通知" href="info/1035/3335.htm" target="_blank"><span class="lefttitle">【公告通知】关于开展“青春共话十九大”微党课...</span></a>',
'<a title="【公告通知】2017-2018学年国家助学金受助学生初审名单公示" href="info/1035/3332.htm" target="_blank"><span class="lefttitle">【公告通知】2017-2018学年国家助学金受助学生初...</span></a>',
'<a title="【公告通知】2016-2017学年学生评优结果公示" href="info/1035/3324.htm" target="_blank"><span class="lefttitle">【公告通知】2016-2017学年学生评优结果公示</span></a>']
In [3]: sel.xpath("//ul[@class='leftclick']/li/a/@href").extract()
2017-11-17 09:59:31 [py.warnings] WARNING: shell:1: ScrapyDeprecationWarning: "sel" shortcut is deprecated. Use "response.xpath()", "response.css()" or "response.selector" instead
Out[3]:
['info/1031/3350.htm',
'info/1031/3349.htm',
'info/1031/3334.htm',
'info/1031/3333.htm',
'info/1031/3328.htm',
'info/1031/3323.htm',
'info/1031/3316.htm',
'info/1031/3314.htm',
'info/1031/3313.htm',
'info/1031/3310.htm',
'info/1033/3330.htm',
'info/1033/3329.htm',
'info/1033/3327.htm',
'info/1033/3326.htm',
'info/1033/3325.htm',
'info/1033/3321.htm',
'info/1033/3320.htm',
'info/1033/3282.htm',
'info/1033/3281.htm',
'info/1033/3273.htm',
'info/1034/3336.htm',
'info/1034/3331.htm',
'info/1034/3312.htm',
'info/1034/3300.htm',
'info/1034/3303.htm',
'info/1034/3280.htm',
'info/1034/3267.htm',
'info/1034/3265.htm',
'info/1034/3261.htm',
'info/1034/3258.htm',
'info/1035/3295.htm',
'info/1035/3348.htm',
'info/1035/3347.htm',
'info/1035/3338.htm',
'info/1035/3339.htm',
'info/1035/3335.htm',
'info/1035/3332.htm',
'info/1035/3324.htm']
这是笔者根据网上的信息,自己做的总结,希望可以帮助想学习scrapy的新手。