For personal study only; contact me for removal in case of infringement.
Analyzing the Zhihu site
1. Profile page URLs have the form: https://www.zhihu.com/people/…
2. Work out the XPath expressions for the content to be scraped
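Before writing the spider, the XPaths can be sanity-checked in Scrapy's interactive shell (launch it with scrapy shell plus a profile URL; the handle below is a placeholder, and Zhihu may reject Scrapy's default User-Agent, so a browser UA may be needed). These are the same selectors used later in parse_item:

# inside `scrapy shell https://www.zhihu.com/people/some-user` (placeholder handle)
response.xpath("//*[@class='ProfileHeader-name']/text()").extract_first()   # nickname
response.xpath("//*[@class='NumberBoard-itemValue']/text()").extract()      # [following, followers]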
Creating the spider
Generate a spider from the crawl template:
scrapy startproject pachong7
cd pachong7
scrapy genspider -t crawl zhihu zhihu.com
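These commands create Scrapy's standard project layout (plus __init__.py files):

pachong7/
├── scrapy.cfg
└── pachong7/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── zhihu.py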
Source code
For personal study only; do not scrape private personal data, commercial information, etc.
items.py
Defines the fields to be scraped.
# -*- coding: utf-8 -*-
import scrapy


class Pachong7Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # nickname
    name = scrapy.Field()
    # one-line headline / introduction
    intro = scrapy.Field()
    # occupation and profile details
    detail = scrapy.Field()
    # number of people the user follows
    following = scrapy.Field()
    # number of followers
    followers = scrapy.Field()
zhihu.py
The concrete crawling rules.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider

from pachong7.items import Pachong7Item


class ZhihuSpider(RedisCrawlSpider):
    name = 'zhihu'
    allowed_domains = ['zhihu.com']
    # start URLs are read from this redis list instead of start_urls
    redis_key = 'ZhihuSpider:start_urls'

    rules = (
        # follow "following"/"followers" list pages to discover more profiles
        Rule(LinkExtractor(allow=('people/.*/following$',
                                  'people/.*/followers$'))),
        # parse profile pages themselves and keep following links from them
        Rule(LinkExtractor(allow=('www.zhihu.com/people/((?!/).)*$',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = Pachong7Item()
        item['name'] = response.xpath("//*[@class='ProfileHeader-name']/text()").extract_first()
        item['intro'] = response.xpath("//*[@class='ztext ProfileHeader-headline']/text()").extract_first()
        item['detail'] = response.xpath("string(//*[@class='ProfileHeader-info'])").extract_first()
        follow_list = response.xpath("//*[@class='NumberBoard-itemValue']/text()").extract()
        if follow_list:
            item['following'] = follow_list[0]
            item['followers'] = follow_list[1]
        # print("nickname:", item['name'])
        return item
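Because the spider inherits from RedisCrawlSpider, it has no start_urls: after launching it with scrapy crawl zhihu it idles until a seed URL appears in the redis list named by redis_key. A minimal seeding sketch with redis-py (the profile URL is a placeholder):

import redis

r = redis.Redis(host='localhost', port=6379)
# push one seed profile; replace the placeholder handle with a real one
r.lpush('ZhihuSpider:start_urls', 'https://www.zhihu.com/people/some-user')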
pipelines.py
Storage for the scraped data.
Items are written into a MongoDB database.
# -*- coding: utf-8 -*-
from pymongo import MongoClient


class Pachong7Pipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient('localhost', 27017)
        self.db = self.client.zhihu_db
        self.collection = self.db.zhihu_collection2

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # close the client connection, not the collection
        self.client.close()
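To spot-check what the pipeline stored, the collection can be queried directly; a minimal sketch with pymongo, using the same database and collection names as above:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
# print a few stored profiles
for doc in client.zhihu_db.zhihu_collection2.find().limit(5):
    print(doc)
client.close()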
settings.py
Set your own USER_AGENT;
set a download delay;
enable the pipeline: ITEM_PIPELINES = { 'pachong7.pipelines.Pachong7Pipeline': 300, }
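A sketch of those three settings; the User-Agent string and the delay are example values, not requirements:

# pretend to be a regular browser (example UA string)
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36')
# wait between requests to avoid hammering the site
DOWNLOAD_DELAY = 2
# enable the MongoDB pipeline
ITEM_PIPELINES = {
    'pachong7.pipelines.Pachong7Pipeline': 300,
}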
# use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# deduplicate requests through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# redis connection info
REDIS_URL = 'redis://localhost:6379'
# keep the redis queues after the crawl stops
SCHEDULER_PERSIST = True
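With SCHEDULER_PERSIST = True the request queue and the dupefilter survive a restart, which is what lets a crawl resume and lets several workers share one queue. The state can be inspected with redis-py; the key names follow scrapy_redis defaults, typically '<spider name>:requests' and '<spider name>:dupefilter':

import redis

r = redis.Redis(host='localhost', port=6379)
# scrapy_redis state that persists between runs
print(r.keys('zhihu:*'))                   # e.g. [b'zhihu:dupefilter', b'zhihu:requests']
print(r.llen('ZhihuSpider:start_urls'))    # seed URLs not yet consumed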