spider
import time
import json
import scrapy
from inline_requests import inline_requests
from fake_useragent import UserAgent
from lxml import etree
from shell.items import ShellItem
from shell.dataprocess import get_ymd_time
class ShellscrpySpider(scrapy.Spider):
    name = 'Allcity'
    allowed_domains = ['xa.ke.com']  # must match the city subdomain in start_urls
    start_urls = ['https://xa.ke.com/xiaoqu/']  # earlier targets: https://www.ke.com/city, https://bj.ke.com/ershoufang/
    headers = {
        "User-Agent": UserAgent().random,  # random User-Agent for each run
        "Cookie": '<fill in your own>',
        "Referer": "https://xa.ke.com/xiaoqu/weiyang/",
    }
    # Constant fields attached to every item scraped in this run
    classItems = {
        # "dbName": "esfcommunity", "collectionName": "beke",
        "province": "陕西省", "city": "西安市", "spiderDate": get_ymd_time()
    }
    def start_requests(self):  # issue the initial request for the xiaoqu index page
        yield scrapy.Request(url=self.start_urls[0], headers=self.headers, dont_filter=True)
    @inline_requests
    def parse(self, response):
        # Collect the district links from the city's xiaoqu index page
        quurl = response.xpath('//div[@data-role="ershoufang"]/div/a/@href').extract()
        listquurl = ["".join(["https://xa.ke.com", i]) for i in quurl]
        print(len(listquurl))
        for url in listquurl:
            res1 = yield scrapy.Request(url=url, headers=self.headers, dont_filter=True)
            # Second-level page: the pager's page-data attribute holds JSON like
            # '{"totalPage":30,"curPage":1}', which gives the page count
            page_data = res1.xpath('//div[@class="page-box fr"]/div/@page-data').get()
            total_page = json.loads(page_data)["totalPage"]
            list2 = ["".join([url, "pg", str(j)]) for j in range(1, total_page + 1)]
            time.sleep(2)  # throttle between districts
            for page_url in list2:
                print(page_url)
                res3 = yield scrapy.Request(url=page_url, headers=self.headers, dont_filter=True)
                for rx in res3.xpath('//ul[@class="listContent"]/li'):
                    # Build a fresh item per listing; reusing one mutable item
                    # across yields would overwrite earlier results
                    items = ShellItem()
                    items.update(self.classItems)
                    rx1 = etree.HTML(rx.get())
                    items["projectName"] = rx1.xpath('//div[1]/div[1]/a[@class="maidian-detail"]/text()')[0]  # community name
                    try:
                        items["referencePrice"] = rx1.xpath('//div[2]/div[1]/div[@class="totalPrice"]/span/text()')[0]  # price
                    except IndexError:
                        items["referencePrice"] = "--"
                    items["region"] = rx1.xpath('//div[1]/div[3]/a[position()<2]/text()')[0] + "区"
                    # priceDesc reads like "3月" (month of the quote); zero-pad
                    # so October onwards doesn't become "2022-010"
                    month = rx1.xpath('//div[2]/div[1]/div[@class="priceDesc"]/text()')[0].split("月")[0]
                    items["updateDate"] = "2022-" + month.zfill(2)
                    yield items
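The spider imports ShellItem from shell.items and get_ymd_time from shell.dataprocess, neither of which is shown above. A minimal sketch of both, assuming the item only needs the fields the spider actually assigns and that get_ymd_time returns the crawl date as a "YYYY-MM-DD" string:

# shell/items.py -- sketch; field set inferred from the spider above
import scrapy

class ShellItem(scrapy.Item):
    province = scrapy.Field()
    city = scrapy.Field()
    spiderDate = scrapy.Field()
    projectName = scrapy.Field()
    referencePrice = scrapy.Field()
    region = scrapy.Field()
    updateDate = scrapy.Field()

# shell/dataprocess.py -- sketch; assumed to return e.g. "2022-03-01"
import time

def get_ymd_time():
    return time.strftime("%Y-%m-%d", time.localtime())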
pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo
class ShellPipeline:
    def __init__(self, mongo_url_bk, mongo_db_bk, mongo_coll_bk, mongo_account_number, mongo_password):
        self.mongo_url_bk = mongo_url_bk
        self.mongo_db_bk = mongo_db_bk
        self.mongo_coll_bk = mongo_coll_bk
        self.mongo_account_number = mongo_account_number
        self.mongo_password = mongo_password
    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy calls this hook with access to the crawler; pull the
        database URI and names from settings.py here.
        """
        return cls(
            mongo_url_bk=crawler.settings.get('MONGO_URL_BK'),
            mongo_db_bk=crawler.settings.get('MONGO_DB_BK'),
            mongo_coll_bk=crawler.settings.get('MONGO_COLL'),
            mongo_account_number=crawler.settings.get('MONGO_ACCOUNT_NUMBER'),
            mongo_password=crawler.settings.get('MONGO_PASSWORD'),
        )
    def open_spider(self, spider):
        """
        Called once when the spider opens: connect to MongoDB.
        """
        self.client_bk = pymongo.MongoClient(self.mongo_url_bk)  # connect
        self.db = self.client_bk[self.mongo_db_bk]  # select the database
        # self.db.authenticate(self.mongo_account_number, self.mongo_password)
        self.coll = self.db[self.mongo_coll_bk]  # select the collection
    def close_spider(self, spider):
        """
        Called once when the spider closes: close the connection.
        """
        self.client_bk.close()
    def process_item(self, items, spider):
        post_item = ItemAdapter(items).asdict()  # convert the item to a plain dict
        self.coll.insert_one(post_item)  # insert one document into the collection
        return items
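from_crawler above reads five keys from settings.py. A sketch of the corresponding entries: the URI is a local-default placeholder, the database and collection names are borrowed from the commented dbName/collectionName hint in the spider, and the ITEM_PIPELINES path assumes the class lives in the project's standard pipelines.py.

# settings.py -- sketch; replace placeholder values with your own
MONGO_URL_BK = 'mongodb://localhost:27017'
MONGO_DB_BK = 'esfcommunity'
MONGO_COLL = 'beke'
MONGO_ACCOUNT_NUMBER = ''  # fill in if your MongoDB requires auth
MONGO_PASSWORD = ''
ITEM_PIPELINES = {
    'shell.pipelines.ShellPipeline': 300,
}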
import into MongoDB
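Once the crawl finishes, a quick pymongo check confirms the documents landed in the collection. A sketch reusing the placeholder URI and names from the settings above:

import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
coll = client['esfcommunity']['beke']
print(coll.count_documents({}))  # total listings stored
print(coll.find_one())           # inspect one scraped record
client.close()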