Task 1: Remember how to store data in MySQL and MongoDB
'''
Storing to MySQL
'''
import pymysql

class QuotePipeline(object):
    def __init__(self):
        # Connect to the local MySQL server and the quotes database
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        table = 'quote'
        # Build the INSERT statement dynamically from the item's fields
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(table=table, keys=keys, values=values)
        try:
            if self.cursor.execute(sql, tuple(item.values())):
                self.connect.commit()
                print("Successful!")
        except Exception:
            print("Failed!")
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
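The pipeline above assumes the quotes database and a quote table whose columns match the item's fields already exist. A minimal one-off setup sketch, assuming the usual quotes-spider fields text, author, and tags (the column names are assumptions; adjust them to your Item definition):

import pymysql

# One-off setup script (sketch); the column names are assumed, not taken from the pipeline
connect = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
cursor = connect.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS quotes DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS quotes.quote ('
    'text VARCHAR(1024), author VARCHAR(255), tags VARCHAR(255))'
)
connect.commit()
cursor.close()
connect.close()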
'''
Storing to MongoDB
'''
import pymongo

class MongoPipeline(object):
    # Collection (table) name
    collection = 'domo'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    # cls as a parameter refers to the class itself
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Insert into MongoDB (insert() is deprecated in pymongo 3.x; use insert_one)
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
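Neither pipeline runs until it is enabled in the Scrapy project's settings.py, and from_crawler expects MONGO_URI and MONGO_DB to be defined there as well. A minimal sketch, assuming the project module is called tutorial (a placeholder name; use your own project's):

# settings.py (sketch; 'tutorial' is a placeholder project module name)
ITEM_PIPELINES = {
    'tutorial.pipelines.QuotePipeline': 300,   # lower number runs first
    'tutorial.pipelines.MongoPipeline': 400,
}
MONGO_URI = 'localhost'
MONGO_DB = 'quotes'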
Task 2: Scraping Weibo data loaded via Ajax
# URL construction
from urllib.parse import urlencode
# Strip HTML tags
from pyquery import PyQuery as pq
# HTTP requests
import requests
# Connect to MongoDB
from pymongo import MongoClient
# Scraping too fast triggers a 418 around page 36, so add a bit of delay
import time

# Connect
client = MongoClient()
# Select the database
db = client['weibo']
# Select the collection
collection = db['weibo_domo2']
max_page = 100

# Store into MongoDB
def save_to_mongo(result):
    collection.insert_one(result)
    print("saved to mongo")

# https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2
# Find the Ajax request carrying X-Requested-With: XMLHttpRequest
# Base URL; the query string is built onto it with urlencode
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474
headers = {
    'host': 'm.weibo.cn',
    # Opened on the mobile site; find the link there, then parse it
    # 'Referer': 'https://m.weibo.cn/p/1005052830678474',
    'Referer': 'https://m.weibo.cn/u/2202323951',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(page):
    params = {
        'type': 'uid',
        'value': '2202323951',
        # 'containerid': '1076032830678474',
        'containerid': '1076032202323951',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)

def parse_page(json, page: int):
    if json:
        # We only need the data under data -> cards
        items = json.get('data').get('cards')
        # enumerate gives each card's index
        for index, item in enumerate(items):
            # On page 1 the card at index 1 has no mblog (it's the follow list,
            # which we don't need), so looping over it blindly raises an error.
            # Skip it.
            if index == 1 and page == 1:
                continue
            else:
                item = item.get('mblog')
                weibo = {}
                # Weibo ID, e.g. "id":"4349509976406880"
                weibo['ID'] = item.get('id')
                # Post text; pq strips the HTML tags
                weibo['text'] = pq(item.get('text')).text()
                # Phone model used to post
                weibo['phone'] = item.get('source')
                # Post time
                weibo['time'] = item.get('edit_at')
                # Number of likes ('attitudes')
                weibo['attitudes'] = item.get('attitudes_count')
                # Number of comments
                weibo['comments'] = item.get('comments_count')
                # Number of reposts
                weibo['reposts'] = item.get('reposts_count')
                yield weibo

if __name__ == '__main__':
    for page in range(1, max_page + 1):
        json = get_page(page)
        if not json:
            # Request failed; skip this page
            continue
        # *json unpacks the returned (json, page) tuple into parse_page, like *args
        results = parse_page(*json)
        time.sleep(3)
        for result in results:
            print(result)
            save_to_mongo(result)
Summary:
1. Without a delay, a 418 shows up around pages 36-38. (418 I'm a teapot: the server refuses the attempt to brew coffee with a teapot.) A retry sketch follows this list.
2. An Ajax response can contain entries in the middle that aren't the data you want; e.g. on Weibo page 1, index 1 is the follow list and followee info, not post data.
3. Getting the Ajax data through the mobile site is much easier than through the PC site.
4. Starting mongod requires specifying dbpath (where the data is stored) first; counting the inserted documents:
   e.g.: mongod --dbpath="F:\MongoDB\Server\3.4\data"
   e.g.: db.weibo_domo2.find().count()
5. In the end all of Zhu Ziqi's posts were scraped, 848 in total; the web version shows 894, and after removing the 48 articles and the one post I discarded myself, that comes to exactly 848. (Success!)
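Instead of a fixed time.sleep(3), the 418 from point 1 could also be handled with a retry and a growing back-off. A minimal sketch, using illustrative names (get_with_retry, base_delay) that are not part of the script above:

import time
import requests

def get_with_retry(url, headers, retries=3, base_delay=5):
    # Hypothetical helper: back off and retry when the server answers 418
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code == 418:
            # Wait longer on each attempt: 5s, 10s, 15s, ...
            time.sleep(base_delay * (attempt + 1))
    return None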
Task 3: Understanding process pools, the os module, building web Ajax requests, and MD5
# URL construction
from urllib.parse import urlencode
# HTTP requests
import requests
# File operations
import os
# md5: hash digest used as a unique, repeat-free file name
from hashlib import md5
# Process pool
from multiprocessing.pool import Pool
# Delay
import time

base_url = 'https://www.toutiao.com/api/search/content/?'
headers = {
    'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(offset):
    # https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
    # Build params from that link; only offset changes between pages
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = base_url + urlencode(params)
    # Return the response as JSON
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)

def get_images(json):
    if json:
        items = json.get('data')
        for item in items:
            # Title
            title = item.get('title')
            # Image list
            images = item.get('image_list')
            for image in images:
                # Yield a dict of one image URL plus the title
                yield {
                    'image': image.get('url'),
                    'title': title,
                }

def save_image(item):
    # Change the current working directory
    os.chdir('F:\\domo')
    # If there is no directory named after the item's title yet, create one
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        # Request the image URL
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            # Build the file name from the MD5 of the image content
            file_path = '{0}\\{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
            # If the image doesn't exist yet, write it in binary mode
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("Already downloaded this file:", file_path)
    except Exception:
        print("Image download failed")

GROUP_START = 1
GROUP_END = 20

def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)

if __name__ == '__main__':
    pool = Pool()
    # Build the offset list: 20 to 400 (20 pages)
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Run main across multiple processes
    pool.map(main, groups)
    # Close the pool
    pool.close()
    # Wait for the remaining processes to finish
    pool.join()
Summary:
1. Basic os module operations (a short demo follows this list):
   os.chdir('path')            -- change the current working directory to path
   os.path.exists('filename')  -- whether the file exists under the current directory; returns True if it does, False if not
   os.mkdir()                  -- create a directory
2. Naming files by their MD5 digest effectively avoids saving the same download twice.
3. A process pool greatly reduces the total scraping time.
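A minimal self-contained demo of points 1 and 2, assuming a writable directory named tmp_demo (an illustrative name):

import os
from hashlib import md5

# Point 1: basic os operations
if not os.path.exists('tmp_demo'):   # does it exist in the current directory?
    os.mkdir('tmp_demo')             # create the directory
os.chdir('tmp_demo')                 # change the working directory into it

# Point 2: identical content -> identical MD5 name -> no duplicate files
for content in [b'same bytes', b'same bytes', b'other bytes']:
    name = md5(content).hexdigest() + '.bin'
    if not os.path.exists(name):
        with open(name, 'wb') as f:
            f.write(content)
    else:
        print('skipped duplicate:', name)
# Only two files are written; the second b'same bytes' is skipped.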