项目名/spiders/爬虫名.py(爬虫文件:用 xpath 等方式提取数据和 url,并发送下一个 url 的请求):
# -*- coding: utf-8 -*-
import scrapy
from tencent.items import TencentItem
class HrSpider(scrapy.Spider):
name = 'hr' # 爬虫名
allowed_domains = ['tencent.com']
start_urls = ['http://hr.tencent.com/position.php']
def parse(self, response):
tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
for tr in tr_list:
item = TencentItem()
item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
item["position"] = tr.xpath("./td[2]/text()").extract_first()
item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
yield item # 将爬取的数据交给pipelines
# 提取下一页的url地址
next_url =