Preface
This section continues with Scrapy,
covering data analysis with pandas, downloader middleware, spider middleware, and item pipelines.
1. Data analysis with pandas
Continuing the example from the previous section,
this part gives a first taste of data analysis.
For more depth, see the pandas cookbook;
I plan to read through that book later.
Summary statistics: info.py
# encoding: utf-8
import pandas as pd
# Rental listings: basic info
# Read the JSON file into a DataFrame
df = pd.read_json("zufang.json")
print(df)
print(df.columns)
# Print summary statistics with describe()
print(df.describe())
# Count listings per district
print(df["district"].value_counts())
# Second-hand listings: basic info
df = pd.read_json("ershoufang.json")
print(df.describe())
# Count listings per district
print(df["district"].value_counts())
Pie chart: pie_chart.py
import numpy as np
import pandas as pd
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
myfont = FontProperties(fname='/Users/seancheney/.matplotlib/mpl-data/fonts/ttf/SimHei.ttf')
labels = '朝阳', '海淀', '昌平', '东城', '大兴', '西城', '丰台', '石景山', '通州', '顺义'
df_zf = pd.read_json("ershoufang.json")
chaoyang_count = df_zf['district'].value_counts()['朝阳']
haidian_count = df_zf['district'].value_counts()['海淀']
changping_count = df_zf['district'].value_counts()['昌平']
dongcheng_count = df_zf['district'].value_counts()['东城']
daxing_count = df_zf['district'].value_counts()['大兴']
xicheng_count = df_zf['district'].value_counts()['西城']
fengtai_count = df_zf['district'].value_counts()['丰台']
shijingshan_count = df_zf['district'].value_counts()['石景山']
tongzhou_count = df_zf['district'].value_counts()['通州']
shunyi_count = df_zf['district'].value_counts()['顺义']
sizes = [chaoyang_count, haidian_count, changping_count, dongcheng_count, daxing_count, xicheng_count, fengtai_count, shijingshan_count, tongzhou_count, shunyi_count]
explode = (0.1, 0, 0, 0,0,0,0,0,0,0)
plt.subplot(121)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=-90)
plt.axis('equal')
plt.title("房屋出售分布", fontproperties=myfont)
plt.rc('font',family=['SimHei'])
plt.show()
Histogram: hist.py
import numpy as np
import pandas as pd
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
df = pd.read_json("ershoufang.json")
print(df.columns)
unitprice_values = df.unitprice
plt.hist(unitprice_values, bins=25)
plt.xlim(0,200000)
plt.title(u"房屋出售每平米价格分布")
plt.xlabel(u'价格(单位:元/平方米)')
plt.ylabel(u'套数')
plt.show()
Price-to-rent ratio: ratio.py
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
district = ('西城', '石景山', '东城', '海淀', '丰台', '昌平', '大兴', '朝阳', '通州')
# Read the rental data
df_zf = pd.read_json("zufang.json")
unitprice_zf = df_zf['price']/df_zf['area']
df_zf['unitprice'] = unitprice_zf
print(df_zf)
month_price = df_zf.groupby(by=['district']).sum()['unitprice'] / df_zf["district"].value_counts()
print(month_price)
# Read the second-hand housing data
df_esf = pd.read_json("ershoufang.json")
sell_price = df_esf.groupby(by=['district']).sum()['unitprice']/df_esf["district"].value_counts()
print(sell_price)
xicheng_ratio = sell_price['西城'] / month_price['西城']
shijingshan_ratio = sell_price['石景山'] / month_price['石景山']
dongcheng_ratio = sell_price['东城'] / month_price['东城']
haidian_ratio = sell_price['海淀'] / month_price['海淀']
fengtai_ratio = sell_price['丰台'] / month_price['丰台']
changping_ratio = sell_price['昌平'] / month_price['昌平']
daxing_ratio = sell_price['大兴'] / month_price['大兴']
chaoyang_ratio = sell_price['朝阳'] / month_price['朝阳']
tongzhou_ratio = sell_price['通州'] / month_price['通州']
ratio = (
    xicheng_ratio,
    shijingshan_ratio,
    dongcheng_ratio,
    haidian_ratio,
    fengtai_ratio,
    changping_ratio,
    daxing_ratio,
    chaoyang_ratio,
    tongzhou_ratio,
)
fig, ax = plt.subplots()
y_pos = np.arange(len(district))
ax.barh(y_pos, ratio, align='center', color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(district)
# ax.invert_yaxis()
ax.set_xlabel('售租比(单位:月)')
ax.set_title('各区房屋售租比')
plt.show()
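As an aside, the sum-then-divide-by-counts pattern in ratio.py is just a hand-rolled mean. A minimal sketch of the equivalent single groupby call, using the same file and column names as above:
import pandas as pd

# Per-district average monthly rent per square meter, equivalent to the
# groupby().sum()['unitprice'] / value_counts() computation in ratio.py
df_zf = pd.read_json("zufang.json")
df_zf['unitprice'] = df_zf['price'] / df_zf['area']
month_price = df_zf.groupby('district')['unitprice'].mean()
print(month_price)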
2. Downloader middleware
Downloader middlewares are invoked in priority order:
- When a request travels from the engine to the downloader, middlewares with smaller numbers run first and larger numbers later.
- When the downloader passes a response back toward the engine, middlewares with larger numbers run first and smaller numbers later.
A configuration sketch follows the list of defaults below. Scrapy ships with a basic set of downloader middlewares:
{
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
}
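These defaults are merged with the project's own DOWNLOADER_MIDDLEWARES setting. A minimal sketch in settings.py, where the custom class path myproject.middlewares.RandomUserAgentMiddleware is hypothetical; assigning None to a built-in middleware disables it:
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
    # Disable the built-in User-Agent middleware so it does not overwrite ours
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}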
A downloader middleware is a class that can define methods
such as process_request(), process_response(), and process_exception().
process_request(request, spider):
- Its parameters are the request and the spider.
- The request parameter is a Request object that carries the headers, url, and other details of the outgoing request.
- process_request() can use that information to modify the request; in that case it normally returns None and the request continues through the remaining middlewares to the downloader. Typical tasks are changing the User-Agent or switching proxies (a sketch follows this list).
- If it fetches the page itself from the request's url and returns a Response object, the remaining process_request() methods and the actual download are skipped, and the response travels back toward the engine (the installed process_response() methods are still applied).
- If it returns a Request object, the middleware chain stops and the returned request is sent back to the scheduler to be scheduled again.
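A minimal sketch of such a middleware; the class name and the user-agent strings are made up for illustration:
import random

class RandomUserAgentMiddleware(object):
    # A hypothetical pool of user-agent strings
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    ]

    def process_request(self, request, spider):
        # Modify the request in place and return None, so the remaining
        # downloader middlewares and the downloader keep processing it
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None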
process_response(request, response, spider):
- Must return a Response object, a Request object, or raise an IgnoreRequest exception. A minimal sketch follows.
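A sketch that re-schedules responses with certain status codes; the class name and the status list are arbitrary examples:
class RetryBadStatusMiddleware(object):
    def process_response(self, request, response, spider):
        if response.status in (403, 503):
            # Returning a Request sends it back to the scheduler;
            # dont_filter keeps the duplicate filter from dropping it
            return request.replace(dont_filter=True)
        # Otherwise pass the response on toward the engine
        return response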
3. Spider middleware
Spider middlewares sit between the engine and the spiders and can:
- process the responses the engine passes to a spider
- process the requests a spider passes back to the engine
- process the items a spider passes back to the engine (see the sketch after this list)
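A minimal sketch of a spider middleware that filters the items a spider yields; the class name and the 'title' field are assumptions, and it would be enabled via SPIDER_MIDDLEWARES in settings.py:
class DropShortTitleMiddleware(object):
    def process_spider_output(self, response, result, spider):
        for element in result:
            # Requests pass through untouched; dict items with a short
            # title are silently dropped
            if isinstance(element, dict) and len(element.get('title', '')) < 2:
                continue
            yield element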
4. Item pipelines
Each pipeline component is a Python class that implements one piece of processing; common uses are:
- cleaning HTML data
- validating scraped data
- checking for duplicates
- storing items in a database
A pipeline class must implement:
- process_item(self, item, spider)
and may optionally implement:
- open_spider(self, spider)
- close_spider(self, spider)
- from_crawler(cls, crawler)
Some typical pipelines follow.
# Dropping items
from scrapy.exceptions import DropItem

class PricePipeline(object):
    vat_factor = 1.15

    def process_item(self, item, spider):
        if item['price']:
            if item['price_excludes_vat']:
                item['price'] = item['price'] * self.vat_factor
            return item
        else:
            raise DropItem("Missing price in %s" % item)
# Storing items in MongoDB
import pymongo

class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
# Storing items in MySQL
import pymysql

class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        print(item['title'])
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item
# Dropping duplicates
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
# Activating pipelines (in settings.py)
ITEM_PIPELINES = {
    'myproject.pipelines.PricePipeline': 300,
    'myproject.pipelines.JsonWriterPipeline': 800,
}
Conclusion
This section took our understanding of Scrapy one step further.