实验
采集Java工程师招聘的前三页数据,包括职位名称、工作区域地点、工资待遇
输出结果中增加数据的页码和序号;
使用Scrapy框架,不使用其他爬虫插件包和技术。
import scrapy
import json
from scrapy.http import JsonRequest
from colorama import Fore
class S1Spider(scrapy.Spider):
    """Crawl the first three pages of Java-engineer job postings on liepin.com.

    For every job card the page number (1-based), the in-page index, the job
    title, the work location and the salary are printed to stdout.
    """

    name = "s1"
    allowed_domains = ["www.liepin.com"]
    start_urls = ["https://www.liepin.com"]

    def start_requests(self):
        """Yield one JSON POST per result page (pages 0-2) to the search API."""
        url = 'https://api-c.liepin.com/api/com.liepin.searchfront4c.pc-search-job'
        headers = {
            'X-Client-Type': 'web',
            'X-Fscp-Bi-Stat': '{"location": "https://www.liepin.com/zhaopin/?d_sfrom=search_sub_site&key=java&imscid=R000000035"}',
            'X-Fscp-Fe-Version': '',
            'X-Fscp-Std-Info': '{"client_id": "40108"}',
            'X-Fscp-Trace-Id': 'f4153999-1c84-4470-9ab2-60896b386c1f',
            'X-Fscp-Version': '1.1',
            'X-Requested-With': 'XMLHttpRequest',
            'X-XSRF-TOKEN': 'TJHUwifARVqjv6xc7VHozw'
        }
        for page in range(3):
            # Build a fresh payload per request instead of mutating one shared
            # dict across iterations; the original only worked because
            # JsonRequest serializes its body eagerly at construction time.
            data = {"data": {
                "mainSearchPcConditionForm": {
                    "city": "410", "dq": "410", "pubTime": "",
                    "currentPage": page, "pageSize": 40, "key": "java",
                    "suggestTag": "", "workYearCode": "", "compId": "",
                    "compName": "", "compTag": "", "industry": "",
                    "salary": "", "jobKind": "", "compScale": "",
                    "compKind": "", "compStage": "", "eduLevel": ""},
                "passThroughForm": {
                    "ckId": "uxe0na8xeyud6t9fr877ygl4enwdspg9", "scene": "page",
                    "skId": "xcogeml91yq2ipt6aytdn8vaxui754c9",
                    "fkId": "xcogeml91yq2ipt6aytdn8vaxui754c9",
                    "sfrom": "search_job_pc"}}
            }
            yield JsonRequest(data=data, url=url, headers=headers,
                              callback=self.parse, meta={'page': page})

    def parse(self, response):
        """Print page number plus index, title, location and salary per card."""
        page = response.meta['page']
        # Scrapy's built-in JSON decoding replaces json.loads(response.text).
        job_cards = response.json()["data"]["data"]["jobCardList"]
        print('第', page + 1, '页')
        for idx, card in enumerate(job_cards, start=1):
            job = card["job"]
            print(Fore.RED, '第', idx, '个',
                  Fore.GREEN, '标题:', Fore.RESET, job["title"],
                  Fore.BLUE, '工作区域', Fore.RESET, job["dq"],
                  Fore.YELLOW, '工资待遇', Fore.RESET, job["salary"])
作业
在京东网站首页搜索框中输入一个商品关键字,编写爬虫程序采集搜索结果页面的商品名称和商品价格,仅要求爬取第一页的前30个商品数据,输出所采集的数据,输出结果时一并输出序号;使用Scrapy框架,不使用其他爬虫技术。
import scrapy
from scrapy.http import JsonRequest
from colorama import Fore
class S1Spider(scrapy.Spider):
    """Scrape name and price of the first 30 products from a JD search page.

    The assignment requires only the first 30 items on page one; each item is
    printed with its 1-based index, product name and price.
    """

    name = 's1'
    allowed_domains = ['search.jd.com']

    def start_requests(self):
        """Request the JD search-result page for the keyword 'iphone'."""
        url = 'https://search.jd.com/Search?keyword=iphone&enc=utf-8&wq=iphone&pvid=7419caf3094144c1b128c5d0476bde84'
        # The response is an HTML page, not JSON, so a plain Request is the
        # right tool; JsonRequest only added misleading JSON headers here.
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Print index, name and price for at most the first 30 result items."""
        # Slice to 30: the spec asks for the first 30 products only, while a
        # JD result page typically carries more <li class="gl-item"> nodes.
        for idx, node in enumerate(response.css('li.gl-item')[:30], start=1):
            # string() flattens nested markup; collapse newlines left by it.
            price = ''.join(node.css('.p-price strong').xpath('string()').getall()).replace('\n', ' ').strip()
            name = ''.join(node.css('.p-name em').xpath('string()').getall()).replace('\n', ' ').strip()
            print(Fore.RED, '第', idx, '件',
                  Fore.GREEN, '商品名:', Fore.RESET, name,
                  Fore.BLUE, '价格:', Fore.RESET, price)