def parse(self, response: HtmlResponse):
    """Yield the text found under the ``XXX`` div(s) as a single item.

    Runs the XPath ``//div[@class="XXX"]//text()`` against ``response``,
    concatenates the extracted pieces with no separator, trims leading and
    trailing whitespace, and yields the result under the ``art_content``
    key.

    Args:
        response: The page to read; presumably the Scrapy response passed
            to this spider callback -- confirm against the caller.

    Yields:
        dict: ``{'art_content': <joined, stripped text>}``.
    """
    text_nodes = response.xpath('//div[@class="XXX"]//text()').extract()
    # An empty separator leaves whatever whitespace the text nodes carry
    # untouched; only the outer edges of the joined string are stripped.
    joined_text = ''.join(text_nodes)
    yield {'art_content': joined_text.strip()}
另外，使用 newspaper3k 同样可以保留原格式。
def parse(self, response: HtmlResponse):
    """Yield the text found under the ``XXX`` div(s) as a single item.

    Runs the XPath ``//div[@class="XXX"]//text()`` against ``response``,
    concatenates the extracted pieces with no separator, trims leading and
    trailing whitespace, and yields the result under the ``art_content``
    key.

    Args:
        response: The page to read; presumably the Scrapy response passed
            to this spider callback -- confirm against the caller.

    Yields:
        dict: ``{'art_content': <joined, stripped text>}``.
    """
    text_nodes = response.xpath('//div[@class="XXX"]//text()').extract()
    # An empty separator leaves whatever whitespace the text nodes carry
    # untouched; only the outer edges of the joined string are stripped.
    joined_text = ''.join(text_nodes)
    yield {'art_content': joined_text.strip()}
另外，使用 newspaper3k 同样可以保留原格式。