获取信息
在items.py里对元数据初始化
import scrapy
class ZhengfuItem(scrapy.Item):
    """Scrapy item declaring the eight fields that the spider fills in.

    Every field is a bare ``scrapy.Field()``; no per-field metadata is set.
    """

    name = scrapy.Field()
    website = scrapy.Field()
    url = scrapy.Field()
    # ``type`` shadows the builtin only inside this class body; the name is
    # kept because it is the key under which the value is stored.
    type = scrapy.Field()
    info = scrapy.Field()
    subtype = scrapy.Field()
    page = scrapy.Field()
    content = scrapy.Field()
scrapy.Field():以字典形式初始化
def get_info(self, response):
    """Build one ``ZhengfuItem`` from ``data`` and yield it.

    Each item field is read from ``data`` under the same key and converted
    with ``str()`` before being stored.

    NOTE(review): ``data`` is not defined anywhere in this snippet and
    ``response`` is never read here. Presumably ``data`` is meant to be
    derived from the response; confirm where it comes from.

    Yields:
        The populated ``ZhengfuItem``.
    """
    item = ZhengfuItem()
    # Same keys, in the same order, as the fields declared on ZhengfuItem.
    for field in ('name', 'website', 'url', 'type',
                  'info', 'subtype', 'page', 'content'):
        item[field] = str(data[field])
    yield item
为元数据赋值
获取网站配置文件对象
def parse_json(self, response):
    """Read the site configuration and decode the JSON body of ``response``.

    The visible part of this callback only gathers values into locals; it
    does not return or yield anything.

    Args:
        response: object exposing ``meta['config_obj']`` (a mapping with at
            least the keys ``'titles'`` and ``'website'``) and ``text``.

    Raises:
        KeyError: if a required key is missing from ``response.meta`` or
            from the configuration mapping.
        json.JSONDecodeError: if the body is not valid JSON after the quote
            replacement.
        IndexError: if the configured website string contains no character
            in the range U+4E00..U+9FA5.
    """
    config_obj = response.meta['config_obj']
    titles_xpath = config_obj['titles']
    # addtimes_xpath = config_obj['title_html']
    # Every single quote is turned into a double quote before decoding.
    # NOTE(review): this also rewrites apostrophes inside values; confirm
    # the upstream payload never contains them.
    datas = json.loads(response.text.replace("'", '"'))
    # Keep only the first run of characters in U+4E00..U+9FA5.
    website = re.findall(r'\s*([\u4e00-\u9fa5]+)', config_obj['website'])[0]
这部分内容在另一个文件里有详细讲述
获取第一级文件的配置
需要获取的内容:文章的标题、文章的url、文章的html
def prase(self, response):
    """Request every article URL found on a first-level listing page.

    Titles, title HTML and URLs are extracted with the XPath expressions
    stored in the configuration. One request is issued per URL, limited to
    the length of the shortest of the three extracted lists.

    NOTE(review): the name looks like a misspelling of ``parse``; it is left
    unchanged because callers may refer to it by this name.

    Args:
        response: object exposing ``meta['config_obj']``, ``xpath()`` and
            ``url``. The configuration must provide the keys ``'titles'``,
            ``'website'``, ``'title_htmls'`` and ``'info_xpath'``.

    Yields:
        ``Request`` objects whose callback is ``self.get_info``.
    """
    config_obj = response.meta['config_obj']
    names = response.xpath(config_obj['titles']).extract()
    # The character class previously contained a stray ')', which made a
    # literal closing parenthesis count as part of the site name.
    website = re.findall(r'\s([\u4e00-\u9fa5]+)', config_obj['website'])
    info_html = response.xpath(config_obj['title_htmls']).extract()
    info_urls = response.xpath(config_obj['info_xpath']).extract()
    # Title and HTML are not used yet, but zipping all three keeps the
    # number of requests bounded by the shortest list.
    for _name, _html, url in zip(names, info_html, info_urls):
        # Links may be relative, so resolve them against the page URL.
        new_url = parse.urljoin(response.url, url)
        yield Request(url=new_url, callback=self.get_info)
zip():
zip() 函数用于将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的对象,这样做的好处是节约了不少的内存。
直接打印 zip() 的返回值,得到的是一个 zip 对象(迭代器,显示为内存地址),我们可以使用 list() 转换来输出列表。
还有