python用selenium驱动浏览器爬取天府新区二手房价--并展示在网页上

效果网址:http://xiaomokuaipao.com/user/soufangwang/

欢迎关注:
1
一、
1,python version: 3.6.5;
2,Django version: 2.0.5;
3,web 应用测试工具selenium库: pip install selenium
4,浏览器驱动: webdriver,我用的chrome浏览器,需要下载对应浏览器版本的驱动器,参考 https://blog.csdn.net/huilan_same/article/details/51896672;
5,用bs4解析网页;
6,用mysql存储数据-注意修改settings的配置;
7,房天下成都天府新区二手房信息:http://cd.esf.fang.com/house-a016418/;
8,网页html格式用到了bootstrap;

二、原理:
1,用web自动测试工具,驱动chrome浏览器访问网页,得到目标网页后;
2,用BeautifulSoup解析网页,提取需要的信息,将提取出的信息存储在mysql数据库里,然后关闭数据库连接和浏览器;
3,最后用从mysql数据库里将存储的数据展示在网页上。

三、
代码实现:这里只放置了views.py的代码,其他代码简单容易实现。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import pymysql

def houseinfo(request):
    """Scrape second-hand house listings for Tianfu New Area from fang.com.

    Drives Chrome via selenium over the first three result pages, parses each
    page with BeautifulSoup, stores complete rows in the MySQL table
    ``user_room`` (recreated on every call), then redirects to the display view.

    NOTE(review): credentials are hard-coded; they belong in Django settings.
    'xxxxxx' placeholder kept from the original.
    """
    connect = pymysql.connect(user='root', password='xxxxxx', host='localhost',
                              port=3306, db='studyuser', charset='utf8')
    conn = connect.cursor()
    conn.execute("create database if not exists studyuser character set utf8;")
    conn.execute("use studyuser;")
    # Table is rebuilt from scratch on each scrape run.
    conn.execute('drop table if exists user_room;')
    conn.execute("""create table if not exists user_room (
        id INT PRIMARY KEY AUTO_INCREMENT,
        house_title VARCHAR(200),
        house_room_number VARCHAR(200),
        house_size VARCHAR(200),
        house_floor VARCHAR(200),
        house_diretion VARCHAR(200),
        house_location VARCHAR(200),
        house_total_price VARCHAR(200),
        house_per_price VARCHAR(200),
        house_link VARCHAR(200))""")

    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 5)

    def parse_and_save():
        """Parse the listings on the current page and insert complete rows.

        Shared by the first-page and next-page handlers (the original
        duplicated this ~60-line body verbatim in both).
        """
        soup = BeautifulSoup(browser.page_source, 'lxml')
        for item in soup.find_all('dl', class_='clearfix'):
            try:
                house_title = item.find('span').text
            except AttributeError:
                house_title = False

            try:
                type_text = item.find('p', class_='tel_shop').text
                # Fixed character offsets are fragile -- they depend on the
                # exact whitespace fang.com emits. TODO(review): split on '|'
                # instead of slicing.
                house_room_number = type_text[40:50].strip()
                house_size = type_text[91:95].strip()
                house_floor = type_text[100:200].strip()
                house_diretion = type_text[270:300].strip()
            except AttributeError:
                house_room_number = house_size = house_floor = house_diretion = False

            try:
                house_location = item.find('p', class_='add_shop').find('span').text
            except AttributeError:
                house_location = False

            house_price = item.find('dd', class_='price_right')
            try:
                house_total_price = house_price.find('span', class_='red').text.strip()
            except AttributeError:
                house_total_price = False
            try:
                house_per_price = house_price.find('span', class_='').text.strip()
            except AttributeError:
                house_per_price = False

            try:
                house_link = 'http://cd.esf.fang.com' + str(item.find('a')['href'])
            except (TypeError, KeyError):
                house_link = False

            row = (house_title, house_room_number, house_size, house_floor,
                   house_diretion, house_location, house_total_price,
                   house_per_price, house_link)
            # Only insert fully-parsed rows (any False field means a parse miss).
            if all(row):
                # Parameterized query: the driver escapes quotes, fixing the
                # original's %-interpolated SQL (injection + crash on titles
                # containing a quote).
                conn.execute(
                    "insert into user_room (house_title,house_room_number,"
                    "house_size,house_floor,house_diretion,house_location,"
                    "house_total_price,house_per_price,house_link) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    row)
                connect.commit()

    def get_first_page():
        """Open the listing index, dismiss the overlay if shown, then parse."""
        browser.get('http://cd.esf.fang.com/house-a016418/')
        try:
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#closemengceng'))).click()
        except Exception:
            pass  # overlay not present -- deliberately best-effort
        time.sleep(1)
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#list_D10_15 > p:nth-child(10) > a')))
        parse_and_save()

    def get_next_page():
        """Click the "next page" link (its position shifts after page 1) and parse."""
        try:
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#list_D10_15 > p:nth-child(10) > a'))).click()
        except Exception:
            wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#list_D10_15 > p:nth-child(12) > a'))).click()
        parse_and_save()

    try:
        get_first_page()
        for _ in range(2):  # pages 2 and 3
            get_next_page()
    finally:
        # Always release resources, even if the scrape dies mid-page
        # (the original leaked the browser and DB connection on error).
        conn.close()
        connect.close()
        # quit() shuts down the chromedriver process; close() only closes
        # the window and leaves the driver running.
        browser.quit()
    return redirect('/user/soufangwang/')

def soufangwang(request):
    """Render the stored listings.

    The template receives the queryset through ``locals()`` under the name
    ``house``.  (``room`` is the model and ``render`` the Django shortcut;
    both must be imported in the full views.py -- not shown in this excerpt.)
    """
    house = room.objects.all()
    return render(request, 'user/soufangwang.html', locals())

四、
Django工作流程原理图:

2

网页效果图:
4
有兴趣也可以看看网站其他未完成页面:用户名:admin密码:123
http://xiaomokuaipao.com/user/index/1/

5
欢迎关注:

6

猫咪:
在这里插入图片描述

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值