#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-01-29 11:56:33
# Project: qcwy
from pyspider.libs.base_handler import *
import pymongo
class Handler(BaseHandler):
    """pyspider handler that crawls a 51job search listing into MongoDB.

    Flow: ``on_start`` requests the search result page, ``index_page``
    schedules every job detail link plus the pagination link, and
    ``detail_page`` extracts the fields of one posting.  ``on_result``
    forwards each extracted record to ``save_to_mongo``.
    """

    crawl_config = {
    }

    # The client and database handle are created once, when the class body
    # is executed, and shared by all callbacks.
    client = pymongo.MongoClient("localhost")  # local MongoDB server
    db = client["tb_qcwy"]  # database name

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the search result page as the crawl entry point."""
        self.crawl('http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=030200&keyword=python&keywordtype=2&lang=c&stype=2&postchannel=0000&fromType=1&confirmdate=9',
                   callback=self.index_page,
                   validate_cert=False,
                   connect_timeout=50,
                   timeout=500
                   )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule the detail pages and the pagination link of a listing.

        :param response: pyspider response for one search result page.
        """
        # Link to each job posting's detail page.
        for each in response.doc('p > span > a').items():
            href = each.attr.href
            # Skip anchors without an href; crawling None would raise.
            if href:
                self.crawl(href, callback=self.detail_page,
                           validate_cert=False)

        # Pagination link.  When the selector matches nothing (e.g. on the
        # last page) the attribute is None, so only crawl when it exists.
        # NOTE(review): only the first '.bk > a' match is followed --
        # confirm against the page markup that this is the "next" link.
        next_url = response.doc('.bk > a').attr.href
        if next_url:
            self.crawl(next_url, callback=self.index_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single job posting.

        :param response: pyspider response for one job detail page.
        :return: dict holding the page URL and the text of the selected
            elements.
        """
        return {
            "url": response.url,  # page address
            "location": response.doc('h1').text(),  # <h1> text (labelled "location" by the author)
            "company": response.doc('.cname > a').text(),  # company name
            "work_location": response.doc('.lname').text(),  # work location
            "salary": response.doc('.cn > strong').text(),  # salary
            "requirements": response.doc('.sp4').text(),  # job requirements
            "zhiweixinxi": response.doc('.job_msg').text(),  # position description
            "address": response.doc('.bmsg > .fp').text(),  # company address
        }

    # Save to MongoDB
    def on_result(self, result):
        """Store every non-empty result in MongoDB.

        This overrides the base handler's ``on_result`` and does not call
        the parent implementation.

        :param result: value returned by a callback; falsy values are
            ignored.
        """
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert one result document into the ``qcwy20180129`` collection.

        :param result: dict to insert; PyMongo adds an ``_id`` key to it.
        """
        # insert_one replaces Collection.insert, which is deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        if self.db["qcwy20180129"].insert_one(result):  # collection name
            print("save to mongo",result)