Without further ado, here is the source code. If you have any questions, please leave a comment.
import requests
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary

class Tieba_Spider(object):
    def __init__(self, tieba_name):
        '''
        Initialize the required parameters and basic settings.
        :param tieba_name: name of the Tieba forum to crawl
        '''
        self.name = tieba_name
        self.url_base = 'http://tieba.baidu.com/f?kw=' + tieba_name + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'ABC'}
    def make_url_lists(self):
        '''
        Build the list of page URLs to download.
        :return: list of URLs (the pn offset advances by 50 per page)
        '''
        return [self.url_base.format(i * 50) for i in range(4)]
    def download_url(self, url_str):
        '''
        Download the given page with requests.get.
        :param url_str: URL of the page to fetch
        :return: raw response body (bytes)
        '''
        result = requests.get(url_str, headers=self.headers)
        return result.content
    def save_result(self, result, page_num):
        '''
        Save a downloaded page to the database.
        :param result: raw page content
        :param page_num: page number used as the cache key
        :return:
        '''
        pass
    def run(self):
        '''
        Main download loop: fetch every page and store it in the cache.
        :return:
        '''
        url_lists = self.make_url_lists()
        for p_num, url_str in enumerate(url_lists, start=1):
            result_str = self.download_url(url_str)
            # mongoc is the module-level MongoCache created in the __main__ block
            mongoc[p_num] = result_str
            print(p_num in mongoc)

class MongoCache(object):
    """
    MongoDB-backed cache for downloaded pages.
    """
    # Cached entries expire after 30 days by default
    def __init__(self, client=None, expires=timedelta(days=30)):
        # Connect to MongoDB (use the passed-in client if one was given)
        self.client = MongoClient(host="localhost", port=27017) if client is None else client
        # Create/select the cache database
        self.db = self.client.cache
        # TTL index on the timestamp field: once a document is older than
        # expireAfterSeconds, MongoDB deletes it automatically
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
    def __setitem__(self, key, value):
        # Pickle and zlib-compress the downloaded page, then store it with a UTC timestamp
        record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
        # Upsert on the key used as _id: $set overwrites an existing record,
        # otherwise a new document is inserted
        self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)
    # Magic method: look the record up by _id; if found, decompress and unpickle it
    def __getitem__(self, item):
        record = self.db.webpage.find_one({"_id": item})
        if record:
            return pickle.loads(zlib.decompress(record["result"]))
        else:
            raise KeyError('{} does not exist'.format(item))  # no matching record
    # Check whether a page is already in the database
    def __contains__(self, item):
        try:
            self[item]  # delegates to __getitem__
        except KeyError:
            return False  # a KeyError means __getitem__ found no matching record
        else:
            return True  # the record exists, so the page is already cached

    # def clear(self):
    #     self.db.webpage.drop()

if __name__ == '__main__':
    tieba_spider = Tieba_Spider('lol')
    # The cache must exist before run(), which reads the mongoc global
    mongoc = MongoCache()
    tieba_spider.run()
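
If you would rather route storage through save_result instead of the module-level mongoc global, here is a minimal sketch of one way the two classes could be wired together. The CachedTiebaSpider subclass and its cache attribute are my own assumptions, not part of the original code.

# Sketch only: CachedTiebaSpider and its cache attribute are assumptions,
# not part of the original post.
class CachedTiebaSpider(Tieba_Spider):
    def __init__(self, tieba_name, cache=None):
        super(CachedTiebaSpider, self).__init__(tieba_name)
        # the cache is held by the spider instead of being a module-level global
        self.cache = MongoCache() if cache is None else cache

    def save_result(self, result, page_num):
        # MongoCache handles pickling, compression and the TTL timestamp
        self.cache[page_num] = result

    def run(self):
        for page_num, url_str in enumerate(self.make_url_lists(), start=1):
            if page_num in self.cache:  # skip pages that are already cached
                continue
            self.save_result(self.download_url(url_str), page_num)

Used the same way as before, e.g. CachedTiebaSpider('lol').run(), except that re-running within the TTL window skips pages that are still cached.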