"""
scrapy 的一个小例子，主要是利用正则匹配 url：
第一个 rule 实现自动翻页，
第二个 rule 实现提取内容。
愿大家每天进步一点点，会发现生活如此美好~
能看到这篇文章，相信老铁已经在爬虫的路上走过一段时间了。给想踏入爬虫这个行业的人
推荐一本书：《Python 爬虫开发与项目实战》。
不用买，网上就有，里面都是基础。在面试中磨练基础，在项目中磨练经验。
"""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from tencent_job.items import TencentJobItem
class TxjobSpider(CrawlSpider):
    """Crawl Tencent HR job listings (hr.tencent.com).

    Two link-extraction rules drive the crawl:
      1. pagination links (``&start=NN``) are followed to reach every list page;
      2. job-detail links (``?id=NN``) are handed to :meth:`parse_item`.
    """

    name = 'txjob'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?&start=0#a']

    rules = (
        # Rule 1: auto-pagination — follow every list page, no callback needed.
        Rule(LinkExtractor(allow=r'&start=\d+'), follow=True),
        # Rule 2: job-detail pages — extract the posting in parse_item.
        Rule(LinkExtractor(allow=r'\?id=\d+'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract a single job posting from a detail page.

        Builds a fresh ``TencentJobItem`` per response (the original kept a
        shared class-level item, which is unsafe with concurrent requests)
        and *yields* it — the original never returned the item, so pipelines
        received nothing.

        :param response: Scrapy Response for a ``position_detail`` page.
        :yields: TencentJobItem with title, address, type, count and the
                 responsibility bullet texts (list of strings).
        """
        item = TencentJobItem()
        item['title'] = response.xpath('//td[@id="sharetitle"]/text()').extract_first()
        item['address'] = response.xpath('//tr[@class="c bottomline"]/td[1]/text()').extract_first()
        item['type'] = response.xpath('//tr[@class="c bottomline"]/td[2]/text()').extract_first()
        item['count'] = response.xpath('//tr[@class="c bottomline"]/td[3]/text()').extract_first()
        item['responsibility'] = response.xpath('//ul[@class="squareli"]//text()').extract()
        yield item