问题描述:scrapy 的 parse 方法中存在循环遍历,但每次 yield 的数据都相同——
返回的 item 中 title、publish_time、source_href、tag 字段在每次迭代中都是一样的
(原因:item 实例在循环外创建,被所有请求共享,回调执行时只剩最后一次赋的值)。
def parse(self, response, **kwargs):
    """Parse the article-list JSON response and schedule one detail request per article.

    Bug fix: the original created a single ``TuiqiuspiderproItem`` *before* the
    loop and mutated it on every iteration, so every scheduled request's
    ``meta['item']`` pointed at the same object — by the time the callbacks ran,
    all items held the last article's title/publish_time/source_href. A fresh
    item is now created inside the loop for each article.

    Yields:
        scrapy.Request: detail-page request per non-video, time-accepted article,
        carrying its own item and the page label via ``meta``.
    """
    resp = response.json()
    # Loop-invariant: the label applies to every article on this page.
    tag = resp['label']
    for article in resp['articles']:
        # Skip videos and articles outside the accepted time window
        # (compTime is a project helper — presumably a recency check; confirm).
        if article['is_video'] is False and compTime(article['published_at']):
            item = TuiqiuspiderproItem()  # one item per article — never share across iterations
            item['title'] = article['title']
            item['publish_time'] = article['published_at']
            item['source_href'] = article['share']
            yield scrapy.Request(
                url=article['share'],
                callback=self._article,
                meta={'item': item, 'tag': tag},
                dont_filter=False,
            )
def _article(self, response):
    """Parse an article detail page and yield the completed item.

    Bug fix: the original unconditionally overwrote ``item['author']`` with an
    xpath ``extract_first()`` *after* the BeautifulSoup logic had already picked
    the correct span (the second one when present), discarding that choice and
    possibly replacing it with ``None``. The xpath result is now only a fallback
    used when the author is still unset.

    Yields:
        The item from ``response.meta``, filled with author/content/tag.
    """
    item = response.meta['item']
    tag = response.meta['tag']
    soup = BeautifulSoup(response.text, 'lxml')
    # Strip HTML comment nodes so they don't leak into the extracted content.
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    try:
        article = soup.find("div", {"class": "news-left"})
        tips = article.find('p', {"class": "tips"}).find_all('span')
        # The second <span> holds the author when present; otherwise the first.
        item['author'] = tips[1].get_text() if len(tips) > 1 else tips[0].get_text()
        self.saveImages(article)
        # Swap the lazy-load attribute so images render from the stored markup.
        item['content'] = str(delattrs(article.find('div', {'class': 'con'}))).replace(
            "data-src", "src")
    except Exception:
        # Best-effort extraction: page layout varies; missing fields are tolerated
        # and the fallback below may still recover the author.
        pass
    if not item.get('author'):
        # Fallback only — do not clobber an author already extracted above.
        item['author'] = response.xpath(
            './/div[@class="news-left"]/p[@class="tips"]/span/text()').extract_first()
    item['tag'] = tag
    yield item