先说好,本博客都是自己练手的,没有任何商业化什么的,如果要求删除请私聊,看到后会第一时间删掉,不要发律师函,谢谢,鸡你太美
import scrapy
from yangguang.items import YangguangItem
class YgSpider(scrapy.Spider):
    """Spider that walks paginated listing pages and follows each row's link.

    ``parse`` handles a listing page: it builds one ``YangguangItem`` per
    table row, requests the row's detail page, and then follows the ">"
    pagination link. ``parse_href`` handles a detail page and completes the
    item with its text and image URLs.
    """

    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Yield a detail-page request per listing row, then the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per row that has a link (carrying the
            partially filled item in ``meta["item"]``), followed by a request
            for the next listing page when a ">" link is present.
        """
        tr_list = response.xpath('//div[@class="greyframe"]/table[2]//tr//tr')
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath('./td[2]/a[2]/@title').extract_first()
            item["href"] = tr.xpath('./td[2]/a[2]/@href').extract_first()
            item["publish_data"] = tr.xpath("./td[5]/text()").extract_first()

            # A row without the expected link cannot be followed; building a
            # Request from None would abort the whole callback, so skip it.
            if item["href"] is None:
                continue

            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves relative
                # ones against the current page.
                response.urljoin(item["href"]),
                callback=self.parse_href,
                meta={"item": item},
            )

        # The XPath must be one string literal. Written as
        # '//a[text()='>']/@href' Python parses a comparison between two
        # strings and hands xpath() a bool, so the outer quotes are doubled.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_href(self, response):
        """Fill in the item's content and image URLs from a detail page.

        Args:
            response: The downloaded detail page; ``response.meta["item"]``
                holds the item started in ``parse``.

        Yields:
            The item, with ``content`` set to the list of text fragments found
            under the ``txt16_3`` cell and ``content_img`` set to the absolute
            URLs of the images found there.
        """
        item = response.meta["item"]
        item["content"] = response.xpath('//td[@class="txt16_3"]//text()').extract()
        img_srcs = response.xpath('//td[@class="txt16_3"]//img/@src').extract()
        # Resolve against the page URL rather than prefixing a fixed host, so
        # an already-absolute src is not corrupted.
        item["content_img"] = [response.urljoin(src) for src in img_srcs]
        yield item
import scrapy
class YangguangItem(scrapy.Item):
    """Container for one scraped post.

    Each attribute is an independent ``scrapy.Field``; the declaration order
    is kept as-is.
    """

    # Values read from the listing-page row.
    title = scrapy.Field()
    href = scrapy.Field()
    publish_data = scrapy.Field()

    # Values read from the detail page.
    content_img = scrapy.Field()
    content = scrapy.Field()

    # Not assigned anywhere in the visible code; presumably reserved for a
    # storage-backend identifier -- TODO confirm against the storage pipeline.
    _id = scrapy.Field()
import re
class YangguangPipeline(object):
    """Item pipeline that collapses an item's content fragments into one string."""

    # Matches every run of whitespace. \xa0 is listed explicitly so that
    # non-breaking spaces are removed even where \s would not cover them.
    _WHITESPACE_RE = re.compile(r"\xa0|\s+")

    def process_item(self, item, spider):
        """Replace ``item["content"]`` with its cleaned, joined text.

        Args:
            item: Mapping-like item whose ``"content"`` value is an iterable
                of text fragments.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, with ``"content"`` now a single string.
        """
        item["content"] = self.process_content(item["content"])
        return item

    def process_content(self, content):
        """Strip all whitespace from each fragment and concatenate the rest.

        Args:
            content: Iterable of text fragments.

        Returns:
            str: The fragments joined together with every whitespace
            character removed; ``""`` when nothing remains.
        """
        # Fragments that become empty contribute nothing to the join, so no
        # separate filtering pass is needed.
        return ''.join(self._WHITESPACE_RE.sub("", fragment) for fragment in content)