def parse(self, response):
    """Scrape the article listing page and schedule each article for parsing.

    Extracts per-article summary fields (title, report, author, posttime,
    link) from every article card, then follows the article link with
    ``parse_article`` as the callback.

    NOTE(review): the populated item is never yielded or forwarded here —
    only the link is used; ``parse_article`` re-extracts everything from
    the detail page.
    """
    article_cards = response.xpath('//*[@id="index"]/div[4]/div/div[1]/article')
    for card in article_cards:
        summary = GeekParkItem()
        summary['title'] = card.xpath('div[1]/a[2]/h3/text()').extract_first()
        summary['report'] = card.xpath('div[1]/p/text()').extract_first()
        summary['author'] = card.xpath('div[2]/a/text()').extract_first()
        summary['posttime'] = card.xpath('div[1]/div/text()').extract_first()
        summary['link'] = card.xpath('div[1]/a[2]/@href').extract_first()
        # Resolve the (possibly relative) link against the listing page URL
        # and hand the detail page off to parse_article.
        detail_url = response.urljoin(summary['link'])
        yield scrapy.Request(detail_url, callback=self.parse_article)
def parse_article(self, response):
    """Parse an article detail page into a GeekParkItem.

    Extracts the title, author, post time, and the concatenated body
    paragraphs from the article node and yields one item per matched
    ``<article>`` element (normally exactly one per page).
    """
    articles = response.xpath('//div[@class="main-wrap"]/article')
    for paper in articles:
        # Build a fresh item per iteration: the original reused a single
        # item instance across yields, so pipelines holding an earlier
        # yielded reference saw it mutated by later iterations.
        item = GeekParkItem()
        item['title'] = paper.xpath('header[@class="post-header"]/h1/text()').extract_first()
        item['author'] = paper.xpath('header[@class="post-header"]/div[@class="user-info"]/a/span/text()').extract_first()
        item['posttime'] = paper.xpath('header[@class="post-header"]/div[@class="user-info"]/span/text()').extract_first()
        # Body text is split across many <p> nodes; join and trim it into
        # a single string.
        text_list = paper.xpath('div[@id="article-body"]/div/p/text()').extract()
        item['text'] = "".join(text_list).strip()
        yield item