Python proxy-IP crawler: scraping Beike housing listings

A Python crawler that scrapes housing listings from Beike (ke.com)

  1. Preparation
  2. Setting up the crawler environment
  3. Scraping the page data with BeautifulSoup
  4. Inserting the data into a MySQL database
Environment setup
  • Install Python 3.9 (https://www.python.org/)
  • Install pymysql, BeautifulSoup (the pip package is beautifulsoup4), fake_useragent and requests, e.g. pip install pymysql beautifulsoup4 fake_useragent requests
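
A quick way to confirm the packages are importable before running the crawler (a minimal check; fake_useragent may fetch its browser data on first use):

# Verify that the third-party dependencies import cleanly
import bs4, pymysql, requests, fake_useragent
print("bs4", bs4.__version__, "| pymysql", pymysql.__version__, "| requests", requests.__version__)
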
Building the proxy IP pool
# Libraries used
import json
import os
import random
import traceback

import bs4
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class IpProxyUtils:
    # Website that publishes free proxy IPs
    __proxyUrl = 'http://www.goubanjia.com/'
    # Whether requests should go through a proxy IP
    __isProxy = True
    # Locally cached proxy IP pool
    __ipList = []
    # Shared User-Agent generator
    ua = UserAgent()

    def __init__(self, proxyUrl, isProxy):
        if proxyUrl is not None:
            self.__proxyUrl = proxyUrl
        if isProxy is not None:
            self.__isProxy = isProxy

    # Generate a random User-Agent string
    def mkUser_Agent(self):
        try:
            return self.ua.random
        except Exception:
            print("Failed to generate a User-Agent, falling back to a fixed one")
            traceback.print_exc()
            return 'Mozilla/5.0'

    # Build browser-like request headers
    def mkHead(self):
        # Random User-Agent for each request
        userAgent = self.mkUser_Agent()
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'User-Agent': userAgent
        }
        return header

    # Pick a random proxy from the pool, initializing the pool first if needed
    def getIp(self):
        if not self.__ipList:
            print("The proxy IP pool is empty, initializing it now")
            self.getProxyIp()
        if self.__ipList:
            return random.choice(self.__ipList)
        return None

    # Load the proxy IP pool, using the local cache file when it already exists
    def getProxyIp(self):
        if self.__ipList:
            return
        # No usable local cache yet: fetch the IP list from the web and write it out
        if not os.path.exists('IpProxy.text') or os.path.getsize('IpProxy.text') == 0:
            response = self.getIpListWeb()
            # Serialize the list to JSON and store it in the local cache file
            with open('IpProxy.text', 'w') as textIpProxy:
                textIpProxy.write(json.dumps(response, indent=2, ensure_ascii=False))
        # Read the cached JSON back into the in-memory pool
        with open('IpProxy.text') as textIpProxy:
            self.__ipList = json.loads(textIpProxy.read())

    """
        读取URL
    """

    def readWeb(self, url):
        head = self.mkHead()
        if self.__isProxy:
            proxy = self.getIp()
            response = requests.get(url=url, headers=head, proxies=proxy, timeout=(3, 7))
            if response.status_code == 200:
                return response.text
        else:
            response = requests.get(url=url, headers=head)
            if response.status_code == 200:
                return response.text
        return None

    """
        从http://www.goubanjia.com/获取代理IP
    """

    def getIpListWeb(self):
        print("正在获取WebIpProxy")
        head = self.mkHead()
        response = requests.get(url=self.__proxyUrl, headers=head)
        if response.status_code != 200:
            print("获取WebIpProxy失败")
            return None
        soup = BeautifulSoup(response.text, "html.parser")
        trList = soup.select("tbody > tr")
        ipList = []
        for tr in trList:
            # The IP cell is padded with hidden elements (style="display:none") to
            # confuse scrapers, so only the visible text fragments are concatenated
            ips = tr.contents[1].contents
            tempIp = ''
            for ip in ips:
                if type(ip) == bs4.element.NavigableString:
                    tempIp = tempIp + ip.string.replace(' ', '')
                    continue
                style = ip.attrs.get('style')
                if style is not None:
                    style = style.replace(' ', '')
                if (style is None or 'display:none' not in style) and len(ip.contents) != 0:
                    tempIp = tempIp + ip.contents[0].string.replace(' ', '')
            # Protocol column ('http' or 'https'); store each entry in the dict
            # format expected by requests' proxies argument, e.g. {'http': 'http://1.2.3.4:80'}
            http = tr.contents[5].next.next
            ipList.append({http: http + "://" + tempIp})
        return ipList
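
Before wiring the class into the crawler it can be smoke-tested on its own. A minimal sketch (the target URL is only an example, and goubanjia.com may be unreachable, in which case getIp simply returns None):

# Quick manual test of IpProxyUtils (proxy disabled so the request goes out directly)
utilsExample = IpProxyUtils(None, False)
print(utilsExample.getIp())  # initializes the pool and prints one random proxy, e.g. {'http': 'http://1.2.3.4:8080'}
html = utilsExample.readWeb('https://cd.ke.com/ershoufang/')
print(html is not None)
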
Fetching the listing HTML
# -*- coding: UTF-8 -*-
import re
import sys
import pymysql

# Import the proxy/fetch helper (the star import also brings in BeautifulSoup and traceback)
from IpProxyUtils import *

class CrawlerBk:

    # Proxy IP pool manager
    __ipProxyUtils = None
    # Database connection
    __myConnect = None
    # Cursor obtained via cursor() for executing SQL statements
    __myCursor = None
    # Number of result pages to crawl per district
    __pageSize = 10

    def __init__(self):
        self.__ipProxyUtils = IpProxyUtils(None, False)
        self.__myConnect = self.connectMysql()
        self.__myCursor = self.__myConnect.cursor()


    # Open the database connection
    def connectMysql(self):
        try:
            db = pymysql.connect(host="localhost", user="root", password="root",
                                 database="testDb", charset='utf8')
            return db
        except BaseException:
            traceback.print_exc()
            print("Failed to open the database connection")
            raise




    # From the landing page, collect each district's URL and crawl its listings
    def openHtml(self, url):
        html = self.__ipProxyUtils.readWeb(url)
        soup = BeautifulSoup(html, "html.parser")
        # Links to every district's second-hand-housing list
        dqList = soup.select("div[data-role='ershoufang'] > div > a[class='CLICKDATA']")
        for dq in dqList:
            # Fields of interest: district, neighborhood, title, unit price, total price,
            # floor, year built, layout, area, orientation, url (openFy extracts only a subset)
            dq1 = dq.string          # district name
            url1 = dq.get('href')    # district's relative URL
            #myThread(url + url1, dq1).start()
            for i in range(1, self.__pageSize + 1):
                # pg%d pages through the results; p1p2p3 is the filter string appended to the listing URL
                fyList = self.openFy(url + url1.split('/')[2] + '/pg%d' % i + 'p1p2p3', dq1)
                self.insertData(fyList)


    # Parse one result page of listings; each listing becomes a tuple matching (mc, url, info, zj, dj, dq)
    def openFy(self, url, dq):
        fyListData = []
        html = self.__ipProxyUtils.readWeb(url)
        soup = BeautifulSoup(html, "html.parser")
        fyListSoup = soup.select(".sellListContent > .clear > div[class='info clear']")
        for fyInfo in fyListSoup:
            fyData = []
            # Listing title and detail-page URL
            fyData.append(fyInfo.select(".title > a")[0].string.strip().replace(' ', ''))
            fyData.append(fyInfo.select(".title > a")[0].get("href"))
            # House description text (neighborhood, layout, area, orientation, ...)
            houseInfo = fyInfo.select(".address > .houseInfo")[0].next_element.next_element.next
            fyData.append(houseInfo.strip().replace('\n', '').replace(' ', ''))
            jgInfo = fyInfo.select(".address > .priceInfo")[0]
            # Total price (the .totalPrice column)
            fyData.append(float(re.findall(r"\d+\.?\d*", jgInfo.select(".totalPrice > span")[0].string)[0]))
            # Unit price (the .unitPrice column)
            fyData.append(float(re.findall(r"\d+\.?\d*", jgInfo.select(".unitPrice > span")[0].string.strip().replace(' ', ''))[0]))
            fyData.append(dq)
            fyListData.append(tuple(fyData))
        return fyListData


    # Insert one page of listings into the fyxx table
    def insertData(self, fyListData):
        try:
            # Parameterized insert; pymysql handles quoting and escaping of the values
            sql = "insert into fyxx(mc,url,info,zj,dj,dq) values(%s,%s,%s,%s,%s,%s)"
            self.__myCursor.executemany(sql, fyListData)
            self.__myConnect.commit()
            print("Rows inserted successfully")
        except:
            print("Failed to insert rows")
            traceback.print_exc()
            self.__myConnect.rollback()

    def init(self, url):
        self.openHtml(url)


if __name__ == "__main__":
    crawlerBk = CrawlerBk()
    crawlerBk.init("https://cd.ke.com/ershoufang/")
    sys.exit(0)
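
The insert above assumes a table named fyxx already exists in the testDb database. The original post does not show its definition, so the following is only a sketch of a schema that matches the columns used (names taken from the SQL above; types and lengths are my assumptions):

# One-off helper to create the assumed fyxx table
import pymysql

ddl = """
create table if not exists fyxx (
    id   int auto_increment primary key,
    mc   varchar(255),    -- listing title
    url  varchar(512),    -- detail-page URL
    info varchar(512),    -- house description
    zj   decimal(12, 2),  -- total price
    dj   decimal(12, 2),  -- unit price
    dq   varchar(64)      -- district
) default charset = utf8
"""

conn = pymysql.connect(host="localhost", user="root", password="root", database="testDb", charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)
conn.commit()
conn.close()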
