Create a Scrapy project:
scrapy startproject <project_name>
(myworld) [root@web rxzWorld]# scrapy startproject tianya
New Scrapy project 'tianya', using template directory '/root/.virtualenvs/myworld/lib/python3.6/site-packages/scrapy/templates/project', created in:
/home/rxzWorld/tianya
You can start your first spider with:
cd tianya
scrapy genspider example example.com
The output tells you the absolute path where the project was created, plus the command for generating a spider.
Create a spider: scrapy genspider <spider_name> <crawl_domain>. Pass a bare domain, not a URL: a full URL would end up in allowed_domains, and the offsite filter would then drop every request.
(myworld) [root@web tianya]# scrapy genspider TYspider bbs.tianya.cn
Created spider 'TYspider' using template 'basic' in module:
tianya.spiders.TYspider
Open the generated spider file and write your own spider. First, the project structure:
(myworld) [root@web tianya]# tree
.
├── __init__.py
├── items.py
├── middlewares.py
├── pipelines.py
├── __pycache__
│   ├── __init__.cpython-36.pyc
│   └── settings.cpython-36.pyc
├── settings.py
└── spiders
    ├── __init__.py
    ├── __pycache__
    │   └── __init__.cpython-36.pyc
    └── TYspider.py  # the custom spider
# spiders/TYspider.py
import re
import scrapy
from bs4 import BeautifulSoup
from lxml import etree
from tianya.items import TianyaItem

class TyspiderSpider(scrapy.Spider):
    name = 'TYspider'  # spider name
    allowed_domains = ['bbs.tianya.cn']  # crawl scope (a bare domain, no scheme)
    start_urls = ['http://bbs.tianya.cn/list-lookout-1.shtml']  # start URL
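You can check that the spider was registered with the standard scrapy list command (output shown for the project above):

(myworld) [root@web tianya]# scrapy list
TYspider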
Define the item fields to be stored:
# in items.py
import scrapy

class TianyaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # fields of the main post:
    bbs_content = scrapy.Field()      # post body
    post_author_url = scrapy.Field()  # author's profile page
    post_author_uid = scrapy.Field()  # author's unique ID
    post_nickname = scrapy.Field()    # author's nickname
    home_post_time = scrapy.Field()   # post time
    home_clink_num = scrapy.Field()   # click count
    home_reply = scrapy.Field()       # reply count
    home_artid = scrapy.Field()       # article ID
    post_like_num = scrapy.Field()    # like count
    title = scrapy.Field()            # post title
    category_name = scrapy.Field()    # category name
    category_url = scrapy.Field()     # category URL
    sub_floor = scrapy.Field()        # reply-floor data
    _id = scrapy.Field()              # MongoDB primary key
    # MongoDB generates _id automatically when the item is inserted
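A TianyaItem behaves like a dict restricted to the declared fields, which is why it can later be converted with dict() before inserting into MongoDB. A minimal sketch (the values are placeholders):

item = TianyaItem()
item['title'] = 'some title'  # assigning a field not declared above would raise KeyError
doc = dict(item)              # plain dict, ready for a MongoDB insert
print(doc)                    # {'title': 'some title'}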
The parse method (the name must not be changed; it is the default callback for start_urls):
    def parse(self, response):  # collect the URLs of all top-level categories
        etree_obj = etree.HTML(response.text)  # lxml element tree for XPath
        classify_url_list = etree_obj.xpath('//div[@class="nav_child_box"]/ul/li/a/@href')  # list of category URLs
        for classify_url in classify_url_list:
            classify_url = response.urljoin(classify_url)  # build the absolute URL
            yield scrapy.Request(  # yield makes the method a generator, keeping memory usage low
                classify_url,  # hand the URL to the scheduler queue
                callback=self.get_one_page  # callback that handles the response
            )
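An XPath can be tried out interactively in the Scrapy shell before wiring it into the spider; the selector below is the one parse uses:

(myworld) [root@web tianya]# scrapy shell http://bbs.tianya.cn
>>> response.xpath('//div[@class="nav_child_box"]/ul/li/a/@href').extract()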
A custom method that collects each post on a listing page, plus the next page:
    def get_one_page(self, response):
        etree_obj = etree.HTML(response.text)  # same as in parse
        forum_post_url_list = etree_obj.xpath('//td[@class="td-title faceblue"]/a/@href')  # URLs of the posts on this page
        for forum_post_url in forum_post_url_list:
            forum_post_url = response.urljoin(forum_post_url)  # build the absolute URL
            yield scrapy.Request(
                forum_post_url,
                callback=self.get_data
            )
        next_page = etree_obj.xpath('//div[@class="short-pages-2 clearfix"]/div/a[@rel="nofollow"]/@href')  # next page
        if next_page:  # if a next page exists
            next_page = response.urljoin(next_page[0])  # build the absolute URL
            yield scrapy.Request(
                next_page,
                callback=self.get_one_page
            )
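response.urljoin is used for the concatenation because it resolves relative hrefs against the current page URL and leaves absolute ones untouched, so both forms work (the example paths are illustrative):

# assuming response.url == 'http://bbs.tianya.cn/list-lookout-1.shtml'
response.urljoin('/post-lookout-1-1.shtml')  # -> 'http://bbs.tianya.cn/post-lookout-1-1.shtml'
response.urljoin('http://bbs.tianya.cn/x')   # absolute URLs pass through unchanged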
A custom method extracts the data, which is inserted into MongoDB as a dictionary of the following shape:
item = {
    "title": "",          # title
    "category_name": "",  # category name
    "category_url": "",   # category URL
    "homeURL": "",        # author's profile page
    "uid": "",            # unique user ID; all of a user's posts can be found by UID
    "uname": "",          # username
    "post_time": "",      # post time
    "clink_num": "",      # click count
    "like_num": "",       # like count
    "bbs_content": "",    # post body
    "sub_floor": [        # one entry per reply floor
        {
            "replyid": "",
            "homeURL": "",      # floor author's profile page
            "uid": "",          # unique user ID
            "uname": "",        # username
            "post_time": "",    # post time
            "bbs_content": "",  # floor body
            "like_num": "",     # like count
            "comment0": {       # comments on the floor, numbered comment0, comment1, ...
                "_rid": "",
                "uid": "",            # unique user ID
                "uname": "",          # username
                "reply_content": "",  # comment body
                "reply_time": ""      # comment time
            }
        },
    ]
}
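For reference, a nested document of this shape can be inserted and queried directly with pymongo (a minimal sketch assuming a local MongoDB and the odata database used later in the pipeline; the values are placeholders):

from pymongo import MongoClient

client = MongoClient()  # local MongoDB on the default port 27017
db = client.odata
doc = {
    "title": "example",
    "sub_floor": [{"replyid": "1", "comment0": {"uid": "42"}}],
}
db.tianya.insert_one(doc)  # MongoDB adds the _id field automatically
print(db.tianya.find_one({"sub_floor.comment0.uid": "42"}))  # dotted paths reach nested fields
client.close()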
    def get_data(self, response):
        item = TianyaItem()  # create the item object
        soup_obj = BeautifulSoup(response.text, 'lxml')  # BeautifulSoup object
        all_data_div_list = soup_obj.findAll('div', {'class': 'atl-item'})  # one div per floor
        post_div = all_data_div_list.pop(0)  # the first div is the main post
        post_div_etree_obj = etree.HTML(str(post_div))
        try:
            item["bbs_content"] = [content.strip() for content in post_div_etree_obj.xpath('//div[@class="bbs-content clearfix"]/text()') if content.strip()]
        except:
            item["bbs_content"] = ''
        try:
            item["post_author_url"] = soup_obj.find('div', {'class': "atl-info"}).span.a["href"]
        except:
            print(response.url, 'post_author_url')
        try:
            item["post_author_uid"] = soup_obj.find('div', {'class': "atl-info"}).span.a["uid"]
        except:
            print(response.url, 'post_author_uid')
        try:
            span_list = [span.text for span in soup_obj.find('div', {'class': "atl-info"}).findAll('span')]
        except:
            span_list = []  # keep the name defined so the lookups below fall back cleanly
            print(response.url, 'span_list')
        try:
            item["post_nickname"] = span_list[0]
        except:
            item["post_nickname"] = ''
        try:
            item["home_post_time"] = span_list[1]
        except:
            item["home_post_time"] = ''
        try:
            item["home_clink_num"] = span_list[2]
        except:
            item["home_clink_num"] = ''
        try:
            item["home_reply"] = span_list[3]
        except:
            item["home_reply"] = ''
        try:
            item["home_artid"] = re.search('artId.*?:(.*?),', response.text, re.S).group(1)
        except:
            item["home_artid"] = ''
        post_like_num = post_div_etree_obj.xpath('//div[@class="action-tyf"]/div[@class="action-tyf-zan"]/p/em/span/text()')
        try:
            item["post_like_num"] = post_like_num[0].strip()
        except:
            item["post_like_num"] = ''
        try:
            item['title'] = soup_obj.find('h1', {'class': "atl-title"}).span.text
        except:
            item['title'] = ''
        try:
            item['category_name'] = soup_obj.find('div', {'class': "atl-location clearfix"}).p.em.a.text
        except:
            item['category_name'] = ''
        try:
            item['category_url'] = response.urljoin(soup_obj.find('div', {'class': "atl-location clearfix"}).p.em.a['href'])
        except:
            item['category_url'] = ''
        sub_floors = []  # collects one dict per reply floor
        for the_all_data_div in all_data_div_list:
            data_etree_obj = etree.HTML(str(the_all_data_div))
            sub_floor = {}  # a fresh dict per floor, so floors do not overwrite each other
            sub_floor["replyid"] = data_etree_obj.xpath('//div[@class="atl-item"]/@replyid')
            sub_floor["homeURL"] = data_etree_obj.xpath('//div[@class="atl-item"]/div/div[2]/span/a/@href')
            sub_floor["uid"] = data_etree_obj.xpath('//div[@class="atl-item"]/div/div[2]/span/a/@uid')
            sub_floor["uname"] = data_etree_obj.xpath('//div[@class="atl-item"]/div/div[2]/span/a/@uname')
            sub_floor["post_time"] = data_etree_obj.xpath('//div[@class="atl-item"]/div/div[2]/span[2]/text()')
            sub_floor["content"] = [content.strip() for content in data_etree_obj.xpath('//div[@class="bbs-content"]/text()') if content.strip()]
            sub_floor["like_num"] = data_etree_obj.xpath('//div[@class="atl-reply"]/span[@class="tuijian"]/a/@_count')
            comment_num = 0  # comment counter, per floor
            replys = the_all_data_div.findAll('div', {'class': "item-reply-view"})
            for reply in replys:
                for reply_info in reply.findAll('li'):
                    comment = {}  # a fresh dict per comment
                    if reply_info.find('span', {'class': "ir-content"}):
                        comment["content"] = reply_info.find('span', {'class': "ir-content"}).text
                    else:
                        comment["content"] = ''
                    if reply_info.find('p'):
                        comment["url"] = reply_info.find('p').a['href']
                    else:
                        comment["url"] = ''
                    comment["uid"] = reply_info.get('_userid', '')  # .get avoids a KeyError when the attribute is missing
                    comment["username"] = reply_info.get('_username', '')
                    matched = re.search('<span class="ir-power">.*?<span>(.*?)</span>', str(reply_info), re.S)
                    comment["post_time"] = matched.group(1) if matched else ''
                    sub_floor['comment%s' % comment_num] = comment
                    comment_num += 1
            sub_floors.append(sub_floor)
        item['sub_floor'] = sub_floors  # all floors, not just the last one
        yield item
The yielded item is handed to the item pipeline. To enable the pipeline:
# uncomment the ITEM_PIPELINES section in settings.py
ITEM_PIPELINES = {
    'tianya.pipelines.TianyaPipeline': 300,  # the number is the priority; lower values run first
}
# The pipeline has two hooks, open_spider and close_spider, each triggered exactly once:
# at spider startup and at shutdown respectively.
from pymongo import MongoClient

class TianyaPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient()  # connect to the local MongoDB
        self.db = self.client.odata  # use the 'odata' database

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        print(item)  # receive the item
        self.db.tianya.insert_one(dict(item))  # write to MongoDB; the collection name 'tianya' is illustrative
        return item  # process_item must return the item for any later pipelines
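With the pipeline enabled, the crawl is started with the standard command, using the spider name defined earlier:

(myworld) [root@web tianya]# scrapy crawl TYspider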