糗事百科文字段子(作者与内容)的爬取过程如下:
步骤:
爬虫代码:
import scrapy


class QiubaiSpider(scrapy.Spider):
    """First draft: print the raw Selector objects for one post to inspect
    what the XPath queries return before any string extraction is added."""

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # Locate every post container, then pull author name and joke text.
        post_divs = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for post in post_divs:
            # No extract() yet — these are Selector/SelectorList objects.
            author = post.xpath('./div[1]/a[2]/h2/text()')[0]
            content = post.xpath('./a[1]/div/span//text()')
            # Print only the first author/content pair to inspect the output.
            print(author, content)
            break
修改配置文件:
查看结果:
修改代码:
import scrapy


class QiubaiSpider(scrapy.Spider):
    """Second draft: same traversal, now converting Selectors to strings
    with extract() so the printed output is plain text."""

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # Locate every post container, then pull author name and joke text.
        post_divs = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for post in post_divs:
            # extract() on a single Selector returns the string stored in
            # its data attribute.
            author = post.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # extract() on a SelectorList returns one string per matched
            # Selector, i.e. a list of text fragments.
            content = post.xpath('./a[1]/div/span//text()').extract()
            # Print only the first author/content pair to inspect the output.
            print(author, content)
            break
结果:
再次修改代码(用 ''.join() 将内容列表拼接为单个字符串):
import scrapy


class QiubaiSpider(scrapy.Spider):
    """Scrape author names and joke text from qiushibaike's text section.

    Final draft: extracts strings and joins the content fragments into a
    single string before printing.
    """

    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """Parse each post container and print the first author/content pair.

        :param response: downloaded page for a ``start_urls`` entry.
        """
        div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
        for div in div_list:
            # extract_first() returns the first matched Selector's string,
            # or None when the XPath matches nothing — unlike the previous
            # ``[0].extract()``, which raises IndexError on an empty match
            # (presumably possible for posts lacking the <a>/<h2> author
            # node, e.g. anonymous users — TODO confirm against the site).
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            # extract() on a SelectorList returns every matched text node
            # as a list of strings.
            content = div.xpath('./a[1]/div/span//text()').extract()
            # Join the text fragments into one string.
            content = ''.join(content)
            # Print only the first pair while verifying the selectors work.
            print(author, content)
            break
结果: