mongodb的爬取数据与可视化1

Elik-hb

已于 2024-04-21 12:09:30 修改

阅读量235

点赞数 1

分类专栏：大数据/mongo 文章标签： mongodb python 数据库

于 2023-10-04 20:23:38 首次发布

本文链接：https://blog.csdn.net/qq_65099052/article/details/133560580

版权

大数据/mongo 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

图片看不了可以打开我的博客：mongodb的爬取数据与可视化 | Elik (gitee.io)

环境介绍

Linux: Ubuntu 16.04 LTS
MongoDB 3.6.23
python3.7

源代码

import requests
import time
from lxml import html
from pymongo import MongoClient


def get_areas(url, col):
    """Fetch the city index page and crawl every district found on it.

    Reads the text and the ``data-district-spell`` attribute of the ``li``
    items in the district filter list, then calls ``get_pages`` once per
    entry with ``url + <attribute value>`` as the district link.

    Args:
        url: Index page URL; the attribute value is appended to it as-is.
        col: Passed through unchanged to ``get_pages``.
    """
    print('start grabing areas')
    # The backslash continuation sits inside the string literal, so the
    # leading spaces of the next line are part of the header value.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) \
                      Chrome/63.0.3239.108 Safari/537.36'}
    response = requests.get(url, headers=headers)
    page = html.fromstring(response.text)

    district_items = '//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li'
    areas = page.xpath(district_items + '/text()')
    print(areas)
    areas_link = page.xpath(district_items + '/@data-district-spell')
    print(areas_link)

    # The two result lists come from separate queries and are paired by index.
    for position, area in enumerate(areas):
        spell = areas_link[position]
        print(spell)
        district_url = url + spell
        print("开始抓取页面:" + district_url)
        get_pages(area, district_url, col)


def get_pages(area, area_link, col):
    """Work out how many result pages a district has and crawl each of them.

    Downloads ``area_link``, reads the ``data-total-count`` attribute of the
    ``page-box`` div, converts it to a page count and calls
    ``get_house_info`` for every page URL.

    Args:
        area: Value appended after ``/#`` in each page URL and passed on to
            ``get_house_info``.
        area_link: District URL used as the prefix of every page URL.
        col: Passed through unchanged to ``get_house_info``.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
    response = requests.get(area_link, headers=request_headers)
    tree = html.fromstring(response.text)

    # The attribute holds the number of listings in this district
    # (per the original author's comment).
    total = int(tree.xpath('//div[@class ="page-box"]/@data-total-count')[0])

    # The count is split into pages of 10; a remainder needs one extra page.
    pages, leftover = divmod(total, 10)
    if leftover:
        pages += 1
    print(f"这个区域有{pages}页")

    for number in range(1, pages + 1):
        page_url = "".join((area_link, '/pg', str(number), '/#', area))
        print(f"开始抓取{number}的信息")
        get_house_info(area, page_url, col)


# Scrape the listings shown on one result page of one district.
def get_house_info(area, url, col):
    """Download ``url``, extract up to 10 listings and insert them via ``col``.

    Args:
        area: Stored under the ``"area"`` key of every extracted item.
        url: Page to download.
        col: Object whose ``insert`` method receives the list of items.

    All exceptions are caught inside the function; nothing is returned.
    """
    hlist = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)\
         Chrome/63.0.3239.108 Safari/537.36'}
    # Pause 2 seconds before every page request.
    time.sleep(2)
    try:
        print(url)
        res = requests.get(url, headers=headers)
        content = html.fromstring(res.text)

        # Each field is queried over the whole document and the i-th match is
        # taken, so the fields of one item are paired purely by position.
        for i in range(10):
            try:
                title = content.xpath("//ul[@class='resblock-list-wrapper']/li/a/@title")[i]
                print(title)
                detail_area= content.xpath("//ul[@class='resblock-list-wrapper']/li//div[@class='resblock-location']/span[2]/text()")[i]
                print(detail_area)
                detail_place = content.xpath("//ul[@class='resblock-list-wrapper']/li//div[@class='resblock-location']/a/text()")[i]
                print(detail_place)
                type = content.xpath("//ul[@class='resblock-list-wrapper']/li//div[@class='resblock-name']/span[1]/text()")[i]
                print(type)
                # A missing i-th area entry is tolerated and stored as "".
                try:
                    square = content.xpath("//ul[@class='resblock-list-wrapper']/li//div[@class='resblock-area']/span/text()")[i]
                except Exception as e:
                    square = ""
                print(square)
                price = content.xpath("//ul[@class='resblock-list-wrapper']/li//div[@class='main-price']/span[1]/text()")[i]
                # Listings whose price text is '价格待定' (price pending) get price 0.
                if price=='价格待定':
                    price = 0
                print(price)
                item = {
                    "area": area,
                    "title": title,
                    "type": type,
                    "square": square,
                    "detail_area": detail_area,
                    "detail_place": detail_place,
                    "price": int(price),
                }
                hlist.append(item)
            # Any failure ends the loop: running out of matches (IndexError)
            # as well as int() rejecting the price text.
            except Exception as e:
                break
        print('writing work has done!continue the next page')
        # NOTE(review): Collection.insert was deprecated in PyMongo 3 and
        # removed in PyMongo 4; this is presumably the call that makes this
        # listing fail. The revised listing uses insert_many instead.
        col.insert(hlist)
    except Exception as e:
        # If requests.get itself raised, `res` was never assigned, so the
        # next line raises a NameError from inside this handler.
        print(res.text)
        print(url)
        # The message says "retrying", but no retry is performed here: the
        # function sleeps 20 seconds and returns.
        print( 'ooops! connecting error, retrying.....')
        time.sleep(20)


def main():
    """Connect to the local MongoDB server and start the crawl.

    Uses the ``loupan`` collection of the ``lianjia`` database on
    ``localhost:27017`` and hands it to ``get_areas`` together with the
    index page URL.
    """
    print('start!')
    mongo = MongoClient('localhost', 27017)
    loupan = mongo.get_database("lianjia").get_collection("loupan")
    get_areas('https://wh.fang.lianjia.com/loupan/', loupan)


# Run the crawl only when the file is executed as a script.
if __name__ == '__main__':
    main()

运行会报错（常见原因：PyMongo 4 已移除 Collection.insert() 方法，需要改用 insert_one() 或 insert_many()）：

代码修改

import requests
import time
from lxml import html
from pymongo import MongoClient

def get_areas(url, col):
    """Fetch the city index page and crawl every district listed on it.

    Each ``li`` of the district filter list supplies both the district name
    (its own text) and the URL slug (its ``data-district-spell`` attribute).
    Reading both from the same element keeps name and slug paired even when
    an item lacks one of them; such items are skipped.

    Args:
        url: Index page URL; the slug is appended to it as-is, so it is
            expected to end with ``/``.
        col: Passed through unchanged to ``get_pages``.

    Raises:
        requests.RequestException: If the index page cannot be downloaded
            (connection failure, timeout or HTTP error status). Nothing can
            be crawled without it, so the error is not swallowed.
    """
    print('开始抓取区域链接')
    # Adjacent literals are concatenated by the parser; a backslash
    # continuation inside the literal would embed the next line's
    # indentation in the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
        )
    }
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    content = html.fromstring(res.text)

    districts = content.xpath(
        '//div[@class="filter-by-area-container"]'
        '/ul[@class="district-wrapper"]/li'
    )
    for district in districts:
        # Only the li's own text nodes, matching the former `li/text()` query.
        area = "".join(district.xpath('./text()')).strip()
        area_link = district.get('data-district-spell')
        if not area or not area_link:
            continue
        link = url + area_link
        print("开始抓取区域页面：" + link)
        get_pages(area, link, col)

def get_pages(area, area_link, col):
    """Work out how many result pages a district has and crawl each of them.

    The listing total is read from the ``data-total-count`` attribute of the
    ``page-box`` div and split into pages of 10. A district whose page cannot
    be downloaded, or which carries no usable total, is reported and skipped
    so that the remaining districts are still crawled.

    Args:
        area: District name; appended after ``/#`` in each page URL and
            passed on to ``get_house_info``.
        area_link: District URL used as the prefix of every page URL.
        col: Passed through unchanged to ``get_house_info``.
    """
    # Adjacent literals are concatenated by the parser; a backslash
    # continuation inside the literal would embed the next line's
    # indentation in the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
        )
    }
    try:
        res = requests.get(area_link, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"区域页面请求失败，跳过该区域：{area_link}（{exc}）")
        return
    content = html.fromstring(res.text)

    totals = content.xpath('//div[@class="page-box"]/@data-total-count')
    try:
        count = int(totals[0])
    except (IndexError, ValueError):
        # No page-box on the page, or a non-numeric total.
        print("未找到楼盘总数，跳过该区域：" + area_link)
        return

    pages = (count + 9) // 10  # ceiling division: 10 listings per page
    print("这个区域有" + str(pages) + "页")

    for page in range(1, pages + 1):
        url = f"{area_link}/pg{page}/#{area}"
        print("开始抓取第" + str(page) + "页的信息")
        get_house_info(area, url, col)

def _first(node, path, default=None):
    """Return the first XPath match of *path* below *node*, or *default*."""
    matches = node.xpath(path)
    return matches[0] if matches else default


def get_house_info(area, url, col):
    """Scrape one result page and store its listings in MongoDB.

    Every listing is read from its own ``li`` element, so the fields of one
    document always belong to the same listing even when another listing on
    the page lacks an element. A listing without a title, location, type or
    price is skipped; a missing floor-area text is stored as ``""``; the
    price text ``'价格待定'`` (price pending) is stored as ``0``.

    Failures are reported and the page is skipped: the function never raises
    for network or database errors, so the crawl continues with the next
    page.

    Args:
        area: District name stored under the ``"area"`` key of each document.
        url: Result page to download.
        col: PyMongo collection that receives the documents.
    """
    # Imported here because the file only imports MongoClient from pymongo.
    from pymongo.errors import PyMongoError

    # Adjacent literals are concatenated by the parser; a backslash
    # continuation inside the literal would embed the next line's
    # indentation in the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'
        )
    }
    # Wait 2 seconds before every page so the site is not hit too often.
    time.sleep(2)

    # The log message promises a retry, so actually retry a few times.
    max_attempts = 3
    res = None
    for attempt in range(1, max_attempts + 1):
        try:
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            break
        except requests.RequestException as exc:
            if attempt == max_attempts:
                print(f"请求失败，跳过该页：{url}（{exc}）")
                return
            print('连接错误，重试中...')
            time.sleep(20)

    if not res.text.strip():
        # html.fromstring rejects an empty document.
        print("页面内容为空，跳过该页：" + url)
        return
    content = html.fromstring(res.text)

    hlist = []
    listings = content.xpath("//ul[@class='resblock-list-wrapper']/li")
    # At most 10 listings per page, matching the page size get_pages assumes.
    for listing in listings[:10]:
        title = _first(listing, "./a/@title")
        detail_area = _first(listing, ".//div[@class='resblock-location']/span[2]/text()")
        detail_place = _first(listing, ".//div[@class='resblock-location']/a/text()")
        house_type = _first(listing, ".//div[@class='resblock-name']/span[1]/text()")
        price_text = _first(listing, ".//div[@class='main-price']/span[1]/text()")
        if None in (title, detail_area, detail_place, house_type, price_text):
            continue
        square = _first(listing, ".//div[@class='resblock-area']/span/text()", "")

        price_text = price_text.strip()
        if price_text == '价格待定':
            price = 0
        else:
            try:
                price = int(price_text)
            except ValueError:
                # Skip only this listing instead of dropping the rest of the page.
                print(f"无法解析价格，跳过该楼盘：{title}（{price_text}）")
                continue

        hlist.append({
            "area": area,
            "title": title,
            "type": house_type,
            "square": square,
            "detail_area": detail_area,
            "detail_place": detail_place,
            "price": price,
        })

    # insert_many rejects an empty list, so only write when there is data.
    if hlist:
        try:
            col.insert_many(hlist)
        except PyMongoError as exc:
            print(f"写入MongoDB失败，跳过该页：{url}（{exc}）")
            return

    print('已完成数据写入，继续下一页')

def main():
    """Connect to the local MongoDB server, run the crawl and disconnect.

    Documents go into the ``loupan`` collection of the ``lianjia`` database
    on ``localhost:27017``. The client is closed even when the crawl raises,
    so no connection is left open.
    """
    print('开始爬取数据')
    url = 'https://wh.fang.lianjia.com/loupan/'
    client = MongoClient('localhost', 27017)
    try:
        col = client.get_database("lianjia").get_collection("loupan")
        get_areas(url, col)
    finally:
        client.close()


# Run the crawl only when the file is executed as a script.
if __name__ == '__main__':
    main()

代码讲解：

 areas = content.xpath('//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li/text()')

这行代码是使用XPath语法从网页的HTML内容中提取特定元素的文本信息。

content.xpath(...): 这是使用lxml库的XPath功能来查询HTML文档的方法。
'//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li/text()': 这是XPath表达式，它告诉lxml要查找满足特定条件的元素。
//：表示从文档的任何位置开始查找。
div[@class="filter-by-area-container"]：这部分表示要查找具有class属性为"filter-by-area-container"的<div>元素。
/ul[@class="district-wrapper"]/li：接着，它在找到的<div>元素内部继续查找子元素，首先找到具有class属性为"district-wrapper"的<ul>元素，然后找到所有的<li>元素。
/text()：最后，它取得每个<li>元素的文本内容。

这行代码的作用是从HTML文档中找到具有特定class属性的<div>元素内的<ul>元素下的所有<li>元素，并提取它们的文本内容，然后将这些文本内容存储在名为areas的变量中。通常，这个操作用于从网页中提取区域名称的列表。

采取元素如图：

代码解释

areas_link = content.xpath('//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li/@data-district-spell')

这行代码与前面提到的代码类似，它也使用XPath语法从HTML内容中提取信息。具体来说，这行代码的作用是从HTML文档中找到具有特定class属性的<div>元素内的<ul>元素下的所有<li>元素的data-district-spell属性的值，并将这些值存储在名为areas_link的变量中。

content.xpath(...): 这是XPath查询方法，用于从HTML文档中提取信息。
'//div[@class="filter-by-area-container"]/ul[@class="district-wrapper"]/li/@data-district-spell': 这是XPath表达式，指定了要查找的元素和属性。
//：表示从文档的任何位置开始查找。
div[@class="filter-by-area-container"]：首先，它查找具有class属性为"filter-by-area-container"的<div>元素。
/ul[@class="district-wrapper"]/li：然后，在找到的<div>元素内部继续查找，首先找到具有class属性为"district-wrapper"的<ul>元素，然后找到所有的<li>元素。
/@data-district-spell：最后，它提取这些<li>元素的data-district-spell属性的值。

areas_link 变量中包含了与区域相关的链接的列表，这些链接是从HTML文档中提取的，并与相应的区域名称相关联。

采取元素如图：

方法解释1

这段代码运行后会从指定的网页抓取房地产信息，并将数据存储到MongoDB数据库中。这是一个爬虫程序，它的主要工作流程和功能：

get_areas(url, col) 函数：从指定的 url 中获取某市区域的所有链接。它首先发送HTTP请求，然后使用XPath从网页内容中提取区域名称和链接。接着，它会调用 get_pages() 函数来获取每个区域的页数，并开始抓取每一页的房地产信息。
get_pages(area, area_link, col) 函数：通过获取某一区域的页数，拼接出某一页的链接，然后从该链接中抓取房地产信息。它会发送HTTP请求，解析网页内容，获取该区域的楼盘总数，计算总页数，并开始抓取每一页的房地产信息。抓取的信息会存储在MongoDB数据库的指定集合（col）中。
get_house_info(area, url, col) 函数：获取某一区域某一页的详细房地产信息。它发送HTTP请求，解析网页内容，提取每个楼盘的标题、位置、类型、面积、价格等信息，并将这些信息存储在一个字典中。然后，将这些字典组成的列表存储到MongoDB数据库中。
main() 函数：程序的入口点。它指定了要抓取的初始网页 url，连接到MongoDB数据库，并调用 get_areas() 函数开始抓取数据。

方法解释2

item = {

                    "area": area,

                    "title": title,

                    "type": type,

                    "square": square,

                    "detail_area": detail_area,

                    "detail_place": detail_place,

                    "price": int(price),

                }

例子解释:一个包含房屋信息的字典 item。每个属性（键）对应着房屋的不同信息，例如区域、标题、类型、面积、详细区域、详细地点和价格。请注意，价格最初是一个字符串，但我们使用 int(price) 将其转换为整数，以便在数据库中存储

# Sample listing data.
area = "某区"
title = "精装两居室"
house_type = "公寓"  # not named `type`, which would shadow the builtin
square = "80平方米"
detail_area = "某小区"
detail_place = "某街道"
price = "1200万"  # scraped prices are strings and must be converted before insertion

# Build the document exactly as the scraper does.
item = {
    "area": area,
    "title": title,
    "type": house_type,
    "square": square,
    "detail_area": detail_area,
    "detail_place": detail_place,
    # int() rejects the unit suffix ("1200万" raises ValueError), so strip
    # it before converting.
    "price": int(price.rstrip("万")),
}

# Print the resulting document.
print(item)

运行结果如下：

{'area': '某区', 
'title': '精装两居室', 
'type': '公寓',
 'square': '80平方米',
 'detail_area': '某小区',
 'detail_place': '某街道',
 'price': 1200}

爬取截图：

注意事项

爬取前一定要检查自己的 MongoDB 中是否已经存在 lianjia 数据库，有的话要先删除掉；否则每次运行都会把同样的楼盘再插入一遍，造成重复数据。

爬取成功如图：

Elik-hb

关注

1
点赞
踩
5

收藏

觉得还不错? 一键收藏
打赏
1
评论
mongodb的爬取数据与可视化1

python3.7。
复制链接

扫一扫

专栏目录