数据都是通过 Ajax 接口加载的,思路不用多讲,直接放爬虫的 py 文件:
import re
from urllib.parse import parse_qs, urlencode

import scrapy
class HrSpider(scrapy.Spider):
    """Crawl Tencent's careers JSON API and yield job postings.

    The careers page loads its listings via Ajax, so we query the
    underlying JSON endpoint directly and page through the results.
    Each yielded item is a dict with keys "地点" (location) and
    "工作名称" (job title).
    """

    name = 'hr'
    allowed_domains = ['tencent.com']

    # Query parameters for the first page of the JSON endpoint.
    data = {
        'pageIndex': '1',
        'pageSize': '10',
        'language': 'zh-cn',
        'area': 'cn',
    }
    base_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
    start_url = base_url + urlencode(data)
    start_urls = [start_url]

    # Crawl pages 1..MAX_PAGE inclusive (matches the original `< 3` +1 logic).
    MAX_PAGE = 3

    # Compile once at class level instead of on every parse() call.
    # re.S so '.' also matches newlines in the raw JSON body.
    _re_location = re.compile(r'"LocationName":"(.*?)"', re.S)
    _re_post_name = re.compile(r'"RecruitPostName":"(.*?)"', re.S)

    def parse(self, response):
        """Extract (location, title) pairs and request the next page.

        Parameters
        ----------
        response : scrapy.http.Response
            A response from the JSON Query endpoint.

        Yields
        ------
        dict
            One item per posting, plus a follow-up Request while the
            current page is below MAX_PAGE.
        """
        html = response.text
        locations = self._re_location.findall(html)
        titles = self._re_post_name.findall(html)
        for location, title in zip(locations, titles):
            yield {"地点": location, "工作名称": title}

        # BUG FIX: the original rebuilt a local `data` dict with
        # pageIndex='1' on every call, so the "next" URL was always
        # page 2 and the crawl never advanced (the dupe filter then
        # dropped the repeated request). Derive the current page from
        # the response URL instead.
        query = parse_qs(response.url.split('?', 1)[-1])
        current_page = int(query.get('pageIndex', ['1'])[0])
        if current_page < self.MAX_PAGE:
            next_data = dict(self.data, pageIndex=str(current_page + 1))
            yield scrapy.Request(
                self.base_url + urlencode(next_data),
                callback=self.parse,
            )
下面是 pipelines.py 的内容:
from pymongo import MongoClient
# Module-level MongoDB connection shared by the pipeline below:
# database "hr", collection "info" on the default localhost server.
# NOTE(review): connecting at import time works but ties the module to a
# running MongoDB; consider moving this into the pipeline's open_spider.
client = MongoClient()
collection = client["hr"]["info"]
class TencentPipeline(object):
    """Scrapy item pipeline that persists scraped jobs to MongoDB."""

    def process_item(self, item, spider):
        """Insert a copy of `item` into the hr.info collection.

        BUG FIX: `Collection.insert` was deprecated in pymongo 3 and
        removed in pymongo 4 — `insert_one` is the supported API.
        We insert `dict(item)` (a copy) so MongoDB's driver does not
        mutate the original item by adding an `_id` field.

        Returns the item unchanged so later pipelines can process it.
        """
        collection.insert_one(dict(item))
        return item
Over~