The earlier post Scrapy项目之自动爬取网页信息 showed how the Scrapy framework makes it easy to crawl web page information automatically. Building on that project, this post stores the crawled information in MySQL so that it ends up in a structured form.
1. Create the crawler project crawltosql
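The project can be generated with Scrapy's standard project template; a typical command (run from whatever directory you want the project to live in) would be:

scrapy startproject crawltosql
cd crawltosql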
2. Write items.py
import scrapy
class CrawltosqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # link of the post
    url = scrapy.Field()
    # title of the post
    title = scrapy.Field()
    # author of the post
    author = scrapy.Field()
    # link to the author's profile
    authorlink = scrapy.Field()
    # number of replies
    reply = scrapy.Field()
    # number of views
    scan = scrapy.Field()
    # publication time
    pubtime = scrapy.Field()
    # time of the last reply
    lastreplytime = scrapy.Field()
    # author of the last reply
    endauthor = scrapy.Field()
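For reference, a CrawltosqlItem behaves like a dict, so each field can simply be assigned the list of values extracted from one page. A minimal sketch with made-up placeholder values (not real crawled data):

from crawltosql.items import CrawltosqlItem

item = CrawltosqlItem()
item["title"] = ["Example post title"]
item["url"] = ["/12345678.html"]
print(dict(item))  # {'title': ['Example post title'], 'url': ['/12345678.html']}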
3. Write pipelines.py
Before writing pipelines.py, first create a database named mydata and, inside it, a table named data:
CREATE TABLE `data` (
  `url` varchar(2083) DEFAULT NULL,
  `title` varchar(2083) DEFAULT NULL,
  `author` varchar(2083) DEFAULT NULL,
  `authorlink` varchar(2083) DEFAULT NULL,
  `reply` varchar(45) DEFAULT NULL,
  `scan` varchar(45) DEFAULT NULL,
  `pubtime` varchar(45) DEFAULT NULL,
  `lastreplytime` varchar(45) DEFAULT NULL,
  `endauthor` varchar(450) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
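If the mydata database does not exist yet, it can be created first from the MySQL client before running the CREATE TABLE statement above (assuming the same local server and root account that pipelines.py connects with below):

CREATE DATABASE mydata DEFAULT CHARACTER SET utf8;
USE mydata;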
After running this statement, the empty data table with the columns above is ready to receive the crawled records.
Then write pipelines.py, which takes the extracted fields from each item and inserts them into the MySQL table:
import pymysql
class CrawltosqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="mydata")

    def process_item(self, item, spider):
        mylen = min(len(item["url"]), len(item["title"]), len(item["author"]), len(item["reply"]),
                    len(item["scan"]), len(item["pubtime"]), len(item["lastreplytime"]), len(item["endauthor"]))
        for j in range(mylen):
            url = "https://bbs.hupu.com" + item["url"][j]
            title = item["title"][j]
            author = item["author"][j]
            authorlink = item["authorlink"][j]
            reply = item["reply"][j]
            scan = item["scan"][j]
            pubtime = item["pubtime"][j]
            lastreplytime = item["lastreplytime"][j]
            endauthor = item["endauthor"][j]
            # Build the SQL statement that inserts this row into the data table
            sql = "insert into data(url,title,author,authorlink,reply,scan,pubtime,lastreplytime,endauthor) VALUES('"+url+"','"+title+"','"+author+"','"+authorlink+"','"+reply+"','"+scan+"','"+pubtime+"','"+lastreplytime+"','"+endauthor+"')"
            # Execute the statement via query() and commit the insert
            self.conn.query(sql)
            self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
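Note that the statement above is built by string concatenation, so a title containing a single quote will break the insert. If that becomes a problem, one possible variant of process_item (a sketch only, using a pymysql cursor with %s placeholders instead of conn.query) is:

    def process_item(self, item, spider):
        mylen = min(len(item["url"]), len(item["title"]), len(item["author"]), len(item["reply"]),
                    len(item["scan"]), len(item["pubtime"]), len(item["lastreplytime"]), len(item["endauthor"]))
        sql = ("insert into data(url,title,author,authorlink,reply,scan,pubtime,lastreplytime,endauthor) "
               "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        with self.conn.cursor() as cursor:
            for j in range(mylen):
                # let the driver escape every value instead of concatenating strings
                cursor.execute(sql, ("https://bbs.hupu.com" + item["url"][j],
                                     item["title"][j], item["author"][j], item["authorlink"][j],
                                     item["reply"][j], item["scan"][j], item["pubtime"][j],
                                     item["lastreplytime"][j], item["endauthor"][j]))
        self.conn.commit()
        return item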
4. Write settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for crawltosql project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'crawltosql'
SPIDER_MODULES = ['crawltosql.spiders']
NEWSPIDER_MODULE = 'crawltosql.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'crawltosql (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'crawltosql.middlewares.CrawltosqlSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'crawltosql.middlewares.CrawltosqlDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'crawltosql.pipelines.CrawltosqlPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
5. Write the spider file crawl.py
# First generate the spider skeleton with the following command
scrapy genspider -t basic crawl hupu.com
The spider file then looks like this:
import scrapy
from scrapy import Request
from crawltosql.items import CrawltosqlItem
class CrawlSpider(scrapy.Spider):
    name = 'crawl'
    allowed_domains = ['hupu.com']
    start_urls = ['https://bbs.hupu.com/vote']

    def parse(self, response):
        item = CrawltosqlItem()
        item["url"] = response.xpath("//a[@class='truetit']/@href").extract()
        item["title"] = response.xpath("//a[@class='truetit']/text()").extract()
        item["author"] = response.xpath("//a[@class='aulink']/text()").extract()
        item["authorlink"] = response.xpath("//a[@class='aulink']/@href").extract()
        # item["replyandscan"]
        # The reply/view element contains '\xa0', which causes errors if stored directly,
        # so handle it separately and split it into the reply count and the view count
        t = response.xpath("//span[@class='ansour box']/text()").extract()
        reply = []
        scan = []
        for u in t:
            tmp = str(u).replace("\xa0", "").split("/")
            if tmp != []:
                reply.append(tmp[0])
                scan.append(tmp[1])
        item["reply"] = reply
        item["scan"] = scan
        item["pubtime"] = response.xpath("//div[@class='author box']//a[@style='color:#808080;cursor: initial; ']/text()").extract()
        item["lastreplytime"] = response.xpath("//div[@class='endreply box']/a/text()").extract()
        item["endauthor"] = response.xpath("//div[@class='endreply box']/span/text()").extract()
        yield item
        for i in range(2, 21):
            url = "https://bbs.hupu.com/vote-" + str(i)
            # yield a Request with the next page's url and the parse callback, so crawling continues automatically
            yield Request(url, callback=self.parse)
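The XPath expressions above match the forum markup at the time of writing; if hupu.com changes its templates they will simply return empty lists. They can be checked interactively with scrapy shell before running the full crawl, for example:

scrapy shell "https://bbs.hupu.com/vote"
>>> response.xpath("//a[@class='truetit']/text()").extract()[:3]
>>> response.xpath("//a[@class='aulink']/text()").extract()[:3]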
6. Run the spider
Run the spider with the following command:
scrapy crawl crawl --nolog
Then check the corresponding table in the database: the data table should now hold one row per crawled post, containing the information the spider collected.
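Besides inspecting the table in a GUI client, a quick way to confirm the pipeline worked is a short pymysql check (same connection parameters as in pipelines.py):

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", passwd="root", db="mydata")
with conn.cursor() as cur:
    cur.execute("select count(*) from data")  # how many posts were stored
    print(cur.fetchone())
    cur.execute("select title, author, scan from data limit 5")  # a small sample
    for row in cur.fetchall():
        print(row)
conn.close()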
The code for this post is available on GitHub at https://github.com/carson0408/AutoCrawler.git ; the crawltosql directory contains the project described here.