目标
爬取中国老龄网的新闻资讯板块(http://www.wdzb.org.cn/forum.php?mod=forumdisplay&fid=99&typeid=34&typeid=34&filter=typeid&page=1):先从新闻列表页获取每条新闻详情页的链接,再逐一爬取详情页的内容。
spider.py
# -*- coding: utf-8 -*-
import scrapy
import re
from zgllw.items import ZgllwItem
class zgllw(scrapy.Spider):
name = 'z'
allowed_domains = ['wdzb.org.cn']
url = 'http://www.wdzb.org.cn/forum.php?mod=forumdisplay&fid=99&typeid=34&typeid=34&filter=typeid&page='
i = 1
start_urls = [url + str(i)]
def parse(self, response):
addrs = response.xpath('//div[@class="list_tit"]/a/@href').extract(