# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from day12.items import WangyiItem
class WangyiSpider(CrawlSpider):
    """Crawl spider for 163.com (NetEase) news articles.

    Follows links within 163.com and extracts the title and body text
    from article pages matching ``https://*.163.com/*.html``.
    """

    name = 'wangyi'
    allowed_domains = ['163.com']
    start_urls = ['http://163.com/']

    rules = (
        # LinkExtractor: regex selecting the URLs we want.
        # callback: method used to parse the matched page's response.
        # follow=True: keep extracting links from the matched page;
        # follow=False: do not extract further links from it.
        Rule(LinkExtractor(allow=r'https:.*\.163\.com/.*\.html'),
             callback='parse_item', follow=False),
        Rule(LinkExtractor(allow=r'163\.com'), follow=True),
    )

    def parse_item(self, response):
        """Parse one article page into a ``WangyiItem``.

        :param response: the downloaded article page.
        :return: a ``WangyiItem`` with ``title`` and ``content`` fields.
        """
        # Article headline; None if the page has no <h1>.
        title = response.xpath('//h1/text()').extract_first()

        # Body paragraphs live under div.post_text; join them into one string.
        paragraphs = response.xpath('//div[@class="post_text"]//p//text()').extract()
        content = ''.join(paragraphs)

        item = WangyiItem()
        item['title'] = title
        # BUG FIX: original code wrote ``content['content'] = content``,
        # which raises TypeError (str does not support item assignment).
        item['content'] = content
        return item