Main code
import scrapy

from qianmu.items import UniversityItem


class QianmuNewSpider(scrapy.Spider):
    name = 'qianmu_new'
    # Any URL under an allowed domain may be crawled: with qianmu.org,
    # both www.qianmu.org and mall.qianmu.org are in scope.
    allowed_domains = ['qianmu.org']
    # Entry URLs for the spider; more than one may be listed.
    start_urls = ['http://www.qianmu.org/ranking/1528.html']

    # Called by the framework once a request to a start_urls link succeeds.
    def parse(self, response):
        # Parse and extract the links; extract() returns a list,
        # extract_first() returns the first element of that list.
        links = response.xpath("//div[@class='rankItem']//td[2]/a/@href").extract()
        for link in links:
            if not link.startswith("http://www.qianmu.org"):
                link = "http://www.qianmu.org/" + link
            # Tell the framework to follow this link, i.e. issue another
            # request; on success the given callback is invoked.
            yield response.follow(link, self.parse_university)
    def parse_university(self, response):
        """Handle a university detail page."""
        response = response.replace(body=response.text.replace("\t", "").replace("\r\n", ""))
        item = UniversityItem()
        data = {}
        # School name
        item["name"] = response.xpath("//div[@id='wikiContent']/h1/text()").extract_first()
        # Info table
        table = response.xpath("//div[@id='wikiContent']/div[@class='infobox']/table")
        if table:
            table = table[0]
            keys = table.xpath(".//td[1]/p/text()").extract()
            cols = table.xpath('.//td[2]')
            # Join every text node in a cell; extract() (not extract_first())
            # is needed here so join() receives a list of strings.
            values = [' '.join(col.xpath('.//text()').extract()) for col in cols]
            if len(keys) == len(values):
                data.update(zip(keys, values))
        item["rank"] = data.get("排名")
        item["country"] = data.get("国家")
        item["state"] = data.get("州省")
        item["city"] = data.get("城市")
        item["undergraduate_num"] = data.get("本科生人数")
        item["postgraduate_num"] = data.get("研究生人数")
        item["website"] = data.get("网址")
        yield item
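The extract() vs extract_first() distinction is exactly where the original values line went wrong. A minimal standalone sketch with parsel (the selector library underlying Scrapy), using a made-up HTML fragment:

from parsel import Selector

# Hypothetical fragment standing in for one infobox cell.
sel = Selector(text="<div><p>25,000</p><p>students</p></div>")

# extract() returns every matching text node as a list,
# so join() produces one combined cell value.
print(' '.join(sel.xpath('//div//text()').extract()))        # 25,000 students

# extract_first() returns a single string; join() over a string
# interleaves spaces between its characters instead.
print(' '.join(sel.xpath('//div//text()').extract_first()))  # 2 5 , 0 0 0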
items
import scrapy


class UniversityItem(scrapy.Item):
    # Define the fields for your item here, e.g.:
    # name = scrapy.Field()
    name = scrapy.Field()
    rank = scrapy.Field()
    country = scrapy.Field()
    state = scrapy.Field()
    city = scrapy.Field()
    undergraduate_num = scrapy.Field()
    postgraduate_num = scrapy.Field()
    website = scrapy.Field()
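A scrapy.Item behaves like a dict restricted to its declared fields, which is what the MySQL pipeline below relies on when it calls item.items(). A minimal sketch with made-up values:

item = UniversityItem()
item["name"] = "MIT"   # hypothetical sample data
item["rank"] = "1"

# items() yields (field, value) pairs for the fields that were set;
# zip(*...) splits them into parallel key/value tuples.
keys, values = zip(*item.items())
print(keys)    # ('name', 'rank')
print(values)  # ('MIT', '1')

# Assigning a field that was never declared raises KeyError:
# item["motto"] = "..."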
pipelines
import pymysql
import redis

from scrapy.exceptions import DropItem


# Deduplicate via Redis
class RedisPipeline:
    # Called once when the spider opens
    def open_spider(self, spider):
        self.redis = redis.Redis(host="host", password="password")

    # Called once when the spider closes
    def close_spider(self, spider):
        self.redis.close()

    # Called once for every item the spider yields
    def process_item(self, item, spider):
        # sadd() returns 1 only if the name was not already in the set
        if self.redis.sadd(spider.name, item['name']):
            return item
        raise DropItem("duplicate university: %s" % item['name'])
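The deduplication hinges on SADD's return value: 1 when the member is newly added, 0 when it already exists. A quick standalone check (assuming a local Redis with no password; adjust host and password to your setup):

import redis

r = redis.Redis(host="localhost")
r.delete("qianmu_new")                  # start from a clean set

print(r.sadd("qianmu_new", "Harvard"))  # 1 -> first sighting, item passes
print(r.sadd("qianmu_new", "Harvard"))  # 0 -> duplicate, DropItem
print(r.smembers("qianmu_new"))         # {b'Harvard'}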
# Save to MySQL
class MysqlPipeline:
    # Called once when the spider opens
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host="127.0.0.1",
            port=3306,
            db="spider",
            user="jiang",
            password="jiang",
            charset="utf8"
        )
        self.cur = self.conn.cursor()

    # Called once when the spider closes
    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

    # Called once for every item the spider yields
    def process_item(self, item, spider):
        # Split the item into parallel tuples of column names and values
        keys, values = zip(*item.items())
        sql = "insert into universities({0}) values({1})".format(
            ','.join(keys),
            ','.join(['%s'] * len(keys))
        )
        self.cur.execute(sql, values)
        self.conn.commit()
        # Print the statement as pymysql actually sent it (private attribute)
        print(self.cur._last_executed)
        return item
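To see what that format() call produces, here is the statement built for a hypothetical two-field item; the %s placeholders are filled in by pymysql at execute() time, which keeps the values properly escaped:

keys = ('name', 'rank')
values = ('Stanford University', '2')   # made-up sample row

sql = "insert into universities({0}) values({1})".format(
    ','.join(keys),
    ','.join(['%s'] * len(keys))
)
print(sql)  # insert into universities(name,rank) values(%s,%s)
# cur.execute(sql, values) then sends roughly:
#   insert into universities(name,rank) values('Stanford University','2')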
settings
Lower pipeline numbers run first, so RedisPipeline (300) drops duplicates before MysqlPipeline (301) ever sees them:
ITEM_PIPELINES = {
    'qianmu.pipelines.MysqlPipeline': 301,
    'qianmu.pipelines.RedisPipeline': 300,
}
entrypoint
Program entry point, equivalent to running scrapy crawl qianmu_new from the project directory:
from scrapy import cmdline

cmdline.execute(["scrapy", "crawl", "qianmu_new"])
Database table
create table `universities`(
    `name` varchar(256) NOT NULL COMMENT 'school name',
    `rank` varchar(32) DEFAULT NULL COMMENT 'school rank',
    `country` varchar(128) DEFAULT NULL COMMENT 'country',
    `state` varchar(128) DEFAULT NULL COMMENT 'state/province',
    `city` varchar(128) DEFAULT NULL COMMENT 'city',
    `undergraduate_num` varchar(128) DEFAULT NULL COMMENT 'undergraduate count',
    `postgraduate_num` varchar(128) DEFAULT NULL COMMENT 'postgraduate count',
    `website` varchar(128) DEFAULT NULL COMMENT 'website URL',
    PRIMARY KEY (`name`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='university info table';
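One mismatch worth noting: the pipeline connects with charset="utf8" while the table is declared utf8mb4. MySQL's utf8 is a 3-byte subset, so 4-byte characters (some CJK extensions, emoji) would be rejected on insert. Passing the fuller charset to pymysql avoids this:

self.conn = pymysql.connect(
    host="127.0.0.1",
    port=3306,
    db="spider",
    user="jiang",
    password="jiang",
    charset="utf8mb4"   # match the table's DEFAULT CHARSET
)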