python爬取晋江_Python + Selenium + PhantomJS 爬取58同城二手房网站，并入库 MongoDB

最新推荐文章于 2022-05-11 14:41:49 发布

weixin_39953356

最新推荐文章于 2022-05-11 14:41:49 发布

阅读量168

点赞数

文章标签： python爬取晋江

import re

from lxml import etree

import datetime

import os

from pymongo import MongoClient

###### MongoDB setup ######

# Name of the database that receives every scraped listing.
db_name = '58_ershoufang'

# Connection to the MongoDB server on this machine, default port.
client = MongoClient(host='localhost', port=27017)

# Handle to the target database; per-city collections are taken from it later.
db = client[db_name]

########## Locate every directory that holds saved listing pages ##########

# Root folder of the previously downloaded pages.
path = r"C:\Users\Administrator\Desktop\python试验田\58\全国二手房信息"


def collect_leaf_dirs(root):
    """Map each leaf directory under *root* to the file names it contains.

    A leaf directory is one without sub-directories. The whole list of file
    names is stored for each leaf, not just its first entry.

    Args:
        root: Directory to walk recursively.

    Returns:
        dict mapping directory path -> list of file names in that directory.
        Empty when *root* does not exist.
    """
    return {
        dirpath: filenames
        for dirpath, dirnames, filenames in os.walk(root)
        if not dirnames
    }


# Keys are directory paths, values are the file-name lists of those directories.
dic_path_url = collect_leaf_dirs(path)

# print(dic_path_url)

###### Derive the province and city names from each directory path ######

# NOTE(review): block indentation was lost in this copy of the script. The
# statements below are presumably the body of this loop - restore the
# indentation before running.
for j in dic_path_url:

list_sheng_shi=j.split('\\') # split the path on backslashes ('\\' is one escaped backslash); NOTE(review): only works for Windows-style paths

# print(list_sheng_shi)

sheng_str=list_sheng_shi[-2] # second-to-last path component, used as the province name

shi_str=list_sheng_shi[-1] # last path component, used as the city name

# print('省：'+sheng_str)

# print('市：'+shi_str)

file_list=dic_path_url[j]

###### MongoDB ######

col = db[sheng_str+'_'+shi_str] # collection named "<province>_<city>"

###### Field-extraction helpers ######

# no longer used: dic={} # dict kept global so later code could assign into it; it was cleared between iterations for reuse

# no longer used: dic['省份']=sheng_str

# no longer used: dic['城市']=shi_str # province and city fields added to the dict for later analysis

CONSTANT = 0 # global running count of listings recorded; its value is also stored as each listing's id

start_num = 0 # global: id of the first listing of the page currently being processed

end_num = 0 # global: id of the last listing of the page currently being processed

# Parse every saved page listed in file_list and store its listings in `col`.
# NOTE(review): block indentation was lost in this copy; the loop, the `if`
# and the nested find_* helpers that follow presumably all belong to this
# function body - restore before running.
def analysis_all():

count_num=0

for single_file in file_list:

count_num=count_num+1 # counts every file, including ones skipped by the filter below

if re.search('html',single_file): # only handle files whose name contains 'html'; NOTE(review): matches anywhere in the name, not just the extension

# NOTE(review): the file is not closed if read()/decode() raises; a `with open(...)` block would guarantee the close.
f= open(j+'\\'+single_file,'rb')

html = f.read().decode('utf-8')

f.close()

print('正在分析第'+str(count_num)+'页:'+j+'\\'+single_file)

# print(html)

def find_xiaoqu():
    """Insert one MongoDB document per listing found on the current page.

    Reads ``html``, ``col``, ``sheng_str`` and ``shi_str`` from the enclosing
    scope. Every community link under ``ul.house-list-wrap`` produces a
    document holding the province, the city, a sequential listing id and the
    community name. A link with no href, or whose href does not contain
    ``/xiaoqu/``, is stored with the placeholder ``'****'``.

    Side effects:
        * advances the global ``CONSTANT`` by the number of listings found;
        * sets the globals ``start_num`` / ``end_num`` to the first and last
          listing id of this page, which the other field helpers use to
          select the documents they update.
    """
    global CONSTANT, start_num, end_num

    # Compiled once instead of once per link.
    xiaoqu_pattern = re.compile('/xiaoqu/.*')
    links = etree.HTML(html).xpath(
        "//ul[@class='house-list-wrap']//p[@class='baseinfo']//a[1]"
    )

    xiaoqu_list = []
    for link in links:
        CONSTANT += 1
        hrefs = link.xpath('@href')  # list; empty when the link has no href
        if hrefs and xiaoqu_pattern.search(hrefs[0]):
            xiaoqu_content = link.text
        else:
            # Missing href, or the link points at something other than a community page.
            xiaoqu_content = '****'
        xiaoqu_list.append({
            '省份': sheng_str,
            '城市': shi_str,
            '房源ID': CONSTANT,
            '小区名称': xiaoqu_content,
        })

    # One bulk insert instead of one round trip per document; insert_many
    # rejects an empty list, hence the guard.
    if xiaoqu_list:
        col.insert_many(xiaoqu_list)

    start_num = CONSTANT - len(xiaoqu_list) + 1  # first listing id of this page
    end_num = CONSTANT  # last listing id of this page

find_xiaoqu() #这样就可以直接运行函数了，其实这里完全没必要用函数，还不如直接运行直线算了，哎，不改了，算了！

# dic['调查房源'+str(CONSTANT)]={'小区名称':xiaoqu_content} #给字典赋值

#不需要这种方法了 CONSTANT=CONSTANT-len(xiaoqu_name) #为了不影响下面字典元素的添加赋值，这里要将CONSTANT复位，直到最后一个函数，然后再重新计算（带着最后一个函数的值），不这样的话，第二个函数都到不了，直接溢出报错！

#不需要这种方法了 return dic

# print(haha)

######单价#######

def find_danjia():
    """Store the unit price on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. Prices are taken from ``p.unit`` elements and written to the
    ``'单价(元)'`` field as integers.

    Values are matched to documents purely by position: the n-th price goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.

    Raises:
        AttributeError: a price element has no text.
        ValueError: a price text is not an integer after stripping the unit.
    """
    price_nodes = etree.HTML(html).xpath("//div[@class='price']//p[@class='unit']")
    # str.strip() takes a character set: remove any of '元', '/', '㎡' from
    # both ends, then surrounding spaces.
    danjia_list = [
        {'单价(元)': int(node.text.strip('元/㎡').strip(' '))}
        for node in price_nodes
    ]

    expected = end_num - start_num + 1
    if len(danjia_list) != expected:
        print('警告: 本页解析到{}个单价，但本页有{}条房源，数据可能错位'.format(
            len(danjia_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, danjia_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_danjia()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['单价']=danjia_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(danjia_name)

# return dic

# print(haha)

#########面积##########

def find_mianji():
    """Store the floor area on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. Areas are taken from the second ``span`` of each listing's first
    ``p.baseinfo`` and written to the ``'面积(平方米)'`` field as floats.

    Values are matched to documents purely by position: the n-th area goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.

    Raises:
        AttributeError: an area element has no text.
        ValueError: an area text is not a number after stripping the unit.
    """
    area_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='list-info']//p[@class='baseinfo'][1]//span[2]"
    )
    # Remove the unit sign and non-breaking spaces from both ends.
    mianji_list = [
        {'面积(平方米)': float(node.text.strip('㎡\xa0'))}
        for node in area_nodes
    ]

    expected = end_num - start_num + 1
    if len(mianji_list) != expected:
        print('警告: 本页解析到{}个面积，但本页有{}条房源，数据可能错位'.format(
            len(mianji_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, mianji_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_mianji()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['面积']=mianji_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(mianji_name)

# return dic

# print(haha)

#######总价#########

def find_zongjia():
    """Store the total price on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. The number inside each ``p.sum > b`` element is multiplied by
    10000 and written to the ``'总价(元)'`` field as a float.

    Values are matched to documents purely by position: the n-th price goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.

    Raises:
        TypeError: a price element has no text.
        ValueError: a price text is not a number.
    """
    total_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='price']//p[@class='sum']/b"
    )
    # NOTE(review): the x10000 presumably converts a figure quoted in units of
    # ten thousand yuan into yuan - confirm against the page.
    zongjia_list = [
        {'总价(元)': float(node.text) * 10000}
        for node in total_nodes
    ]

    expected = end_num - start_num + 1
    if len(zongjia_list) != expected:
        print('警告: 本页解析到{}个总价，但本页有{}条房源，数据可能错位'.format(
            len(zongjia_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, zongjia_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_zongjia()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['总价']=zongjia_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(zongjia_name)

# return dic

# # print(haha)

######所在楼层########

def find_suozailouceng():
    """Store the floor description on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. The text of the last ``span`` in each listing's first
    ``p.baseinfo`` is written unchanged to the ``'所在楼层'`` field (``None``
    when the element has no text).

    Values are matched to documents purely by position: the n-th value goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.
    """
    floor_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='list-info']//p[@class='baseinfo'][1]//span[last()]"
    )
    suozailouceng_list = [{'所在楼层': node.text} for node in floor_nodes]

    expected = end_num - start_num + 1
    if len(suozailouceng_list) != expected:
        print('警告: 本页解析到{}个楼层信息，但本页有{}条房源，数据可能错位'.format(
            len(suozailouceng_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, suozailouceng_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_suozailouceng()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['所在楼层']=suozailouceng_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(suozailouceng_name)

# return dic

# find_suozailouceng()

########封面#########

def find_fengmian():
    """Store the cover-image URL on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. The ``data-src`` attribute of each image under ``div.pic`` is
    written to the ``'封面图片地址'`` field. An image without that attribute
    is recorded as ``'****'``, the placeholder this script uses for missing
    data, instead of raising IndexError.

    Values are matched to documents purely by position: the n-th image goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.
    """
    image_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='pic']//img"
    )
    fengmian_list = [
        {'封面图片地址': node.get('data-src', '****')}
        for node in image_nodes
    ]

    expected = end_num - start_num + 1
    if len(fengmian_list) != expected:
        print('警告: 本页解析到{}个封面，但本页有{}条房源，数据可能错位'.format(
            len(fengmian_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, fengmian_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_fengmian()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['封面照片地址']=fengmian_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(fengmian_name)

# return dic

# find_fengmian()

######标题##########

def find_biaoti():
    """Store the listing title on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. The text of each ``h2 a`` link under ``div.list-info`` is stripped
    of surrounding spaces and written to the ``'标题'`` field. A link with no
    direct text is stored as an empty string instead of raising
    AttributeError.

    Values are matched to documents purely by position: the n-th title goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.
    """
    title_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='list-info']//h2//a"
    )
    biaoti_list = [
        {'标题': (node.text or '').strip(' ')}
        for node in title_nodes
    ]

    expected = end_num - start_num + 1
    if len(biaoti_list) != expected:
        print('警告: 本页解析到{}个标题，但本页有{}条房源，数据可能错位'.format(
            len(biaoti_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, biaoti_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_biaoti()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['标题']=biaoti_content #给字典中的字典再次添加新的值

# CONSTANT=CONSTANT-len(biaoti_name)

# return dic

# find_biaoti()

#######发布时间#########

def find_shijian():
    """Store the publication date on every listing of the current page.

    Reads ``html``, ``col``, ``start_num`` and ``end_num`` from the enclosing
    scope. For each ``div.time`` element, a text of exactly ``'今天'`` (after
    stripping spaces) becomes the current local date as ``YYYY-MM-DD``; any
    other text, including a missing one, becomes the placeholder ``'****'``.
    The result is written to the ``'时间'`` field.

    NOTE(review): the date is taken when this analysis runs, not when the
    page was downloaded, and every non-'今天' value is discarded - confirm
    that both are intended.

    Values are matched to documents purely by position: the n-th value goes to
    the n-th document of the page, ordered by listing id. A warning is printed
    when the two counts differ; surplus values or documents are left alone.
    """
    time_nodes = etree.HTML(html).xpath(
        "//div[@class='content-wrap']//ul[@class='house-list-wrap']//div[@class='time']"
    )
    # Computed once for the whole page rather than once per element.
    today = datetime.datetime.now().strftime('%Y-%m-%d')

    shijian_list = []
    for node in time_nodes:
        shijian_content = (node.text or '').strip(' ')
        if shijian_content == '今天':
            shijian_list.append({'时间': today})
        else:
            shijian_list.append({'时间': '****'})

    expected = end_num - start_num + 1
    if len(shijian_list) != expected:
        print('警告: 本页解析到{}个发布时间，但本页有{}条房源，数据可能错位'.format(
            len(shijian_list), expected))

    # Sort explicitly so the pairing does not depend on natural order, and
    # update by _id rather than by the whole document.
    page_docs = col.find(
        {'房源ID': {'$gte': start_num, '$lte': end_num}}
    ).sort('房源ID', 1)
    for doc, fields in zip(page_docs, shijian_list):
        col.update_one({'_id': doc['_id']}, {'$set': fields})

find_shijian()

# dic_inside=dic['调查房源'+str(CONSTANT)] #因为上面定义dic的第一个key值对应的也是一个字典所以，先定义出来比较方便下面赋值理解

# dic_inside['时间']=shijian_content #给字典中的字典再次添加新的值

# return dic

# find_shijian()

# else:

# pass

# Progress message: the data has been written to MongoDB.
print('已经写入Mongo')

# Run the analysis. NOTE(review): presumably executed once per directory inside
# the loop over dic_path_url - indentation was lost in this copy, confirm.
analysis_all()

weixin_39953356

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫