注意事项
-
crawl_config 中的 'itag': "v101" 用于去重:只要 itag 不变,在 pyspider 的 dashboard 点击 run 时就不会重新执行爬取;需要重新爬取时,修改这个版本号就可以了。
直接上代码
from pyspider.libs.base_handler import *
import os
import pymongo
import pandas as pd
import numpy as np
import time
import json
# MongoDB connection settings. Change these constants to point the crawler
# at a different server or database.
DATABASE_IP = '127.0.0.1'
DATABASE_PORT = 27017
DATABASE_NAME = 'touzi'

client = pymongo.MongoClient(DATABASE_IP, DATABASE_PORT)
# Select the database through DATABASE_NAME so the constant is the single
# source of truth (it was previously defined but bypassed by `client.touzi`).
db = client[DATABASE_NAME]
collection = db.zifang  # collection that scraped records are inserted into
class Handler(BaseHandler):
    """pyspider handler that scrapes listing pages on zijin.trjcn.com.

    Flow: ``on_start`` fetches the first listing page, ``index_page``
    schedules the following listing pages, ``detail_page`` extracts one
    record per listing entry, and ``on_result`` stores the records in the
    module-level MongoDB ``collection``.
    """

    # Changing the itag value forces already-crawled tasks to be crawled
    # again; with an unchanged itag, pressing "run" does not re-crawl.
    crawl_config = {
        'itag': "v101"
    }

    # Page numbers scheduled by index_page (pages 2, 3 and 4).
    _PAGE_NUMBERS = range(2, 5)

    # Number of leading characters dropped from the region / industry text.
    # Presumably this is the length of the field label on the page --
    # TODO confirm against the live markup.
    _LABEL_PREFIX_LEN = 5

    @staticmethod
    def _strip_label(text):
        """Remove all whitespace from *text*, then drop the leading label."""
        compact = "".join(text.split())
        return compact[Handler._LABEL_PREFIX_LEN:]

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: fetch the first listing page (scheduled every 24h)."""
        self.crawl(
            'https://zijin.trjcn.com/list_1000.html',
            callback=self.index_page,
            validate_cert=False,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule the follow-up listing pages for ``detail_page``.

        NOTE(review): ``response`` (the first listing page) is not parsed
        here, so the entries on page 1 are never stored -- confirm whether
        that is intended.
        """
        for page in self._PAGE_NUMBERS:
            self.crawl(
                'https://zijin.trjcn.com/list_1000_p{}.html?'.format(page),
                callback=self.detail_page,
                validate_cert=False,
            )

    @config(priority=2)
    def detail_page(self, response):
        """Extract one record per listing entry on a listing page.

        Returns:
            A list of dicts with the keys ``title``, ``money``, ``diqu``
            and ``hangye`` (presumably region and industry -- the names are
            pinyin). The list is empty when no entry matches the selector.
        """
        records = []
        for entry in response.doc(".ui-bg-white > .j-hover-all").items():
            records.append({
                "title": entry(".J_btn_dl_deliver").attr('data-title'),
                "money": entry(".part-money-text-list >span >em").text(),
                # The 4th and 5th <span> of an entry hold the two labelled
                # fields; the label prefix is cut off by _strip_label.
                "diqu": self._strip_label(entry("span").eq(3).text()),
                "hangye": self._strip_label(entry("span").eq(4).text()),
            })
        return records

    def on_result(self, result):
        """Store a callback's return value in MongoDB; skip empty results."""
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert the scraped records into the module-level ``collection``.

        Args:
            result: A list of record dicts, or a single record dict.

        Any error raised by ``insert_many`` propagates to the caller.
        """
        if isinstance(result, dict):
            # A single record: wrap it so insert_many receives a list.
            documents = [dict(result)]
        else:
            # Shallow copies: insert_many adds an ``_id`` key to each
            # document, which must not leak back into the caller's dicts.
            documents = [dict(record) for record in result]
        if not documents:
            # insert_many rejects an empty batch, so there is nothing to do.
            return
        # insert_many raises on failure, so reaching the print means success.
        collection.insert_many(documents)
        print('存储到 mongondb 成功')