Scraping Beike (ke.com) housing listings with a Python crawler
- Preparation
- Set up the crawler environment
- Scrape the page data with BeautifulSoup
- Insert the data into a MySQL database
Preparing the environment
- Install Python 3.9 from https://www.python.org/
- Install pymysql, BeautifulSoup, and fake_useragent, as shown below
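Besides the interpreter, the third-party packages have to be installed; note that BeautifulSoup is published on PyPI as beautifulsoup4, and requests is needed as well. A minimal sketch that installs them from within Python, equivalent to running pip install in a shell:
# Install the third-party packages used in this article via pip.
import subprocess
import sys

for pkg in ("pymysql", "beautifulsoup4", "fake_useragent", "requests"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])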
Building a proxy IP pool
# Libraries used
import json
import os
import random
import traceback
import bs4
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class IpProxyUtils:
    # URL of the site that publishes free proxy IPs
    __proxyUrl = 'http://www.goubanjia.com/'
    # Whether to route requests through a proxy IP
    __isProxy = True
    # Locally cached IP pool
    __ipList = []
    # Shared User-Agent generator
    ua = UserAgent()
def __init__(self, proxyUrl, isProxy):
if proxyUrl is not None:
self.__proxyUrl = proxyUrl
if isProxy is not None:
self.__isProxy = isProxy
    # Produce a random User-Agent string
    def mkUser_Agent(self):
        try:
            return self.ua.random
        except Exception:
            print("Failed to get a random User-Agent")
            traceback.print_exc()
            # Fall back to a generic UA so callers still get usable headers
            return 'Mozilla/5.0'
    # Build browser-like request headers
    def mkHead(self):
        # A fresh random User-Agent for every request
        userAgent = self.mkUser_Agent()
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'User-Agent': userAgent
        }
        return header
    # Pick a random proxy from the pool, initializing the pool on first use
    def getIp(self):
        if not self.__ipList:
            print("Proxy IP pool is empty, initializing it")
            self.getProxyIp()
        if self.__ipList:
            return random.choice(self.__ipList)
        return None
    # Load the proxy pool, preferring the local cache file
    def getProxyIp(self):
        if self.__ipList:
            return
        # Fetch from the web only when the local cache is missing or empty
        if not os.path.exists('IpProxy.text') or os.path.getsize('IpProxy.text') == 0:
            response = self.getIpListWeb()
            # Serialize the list to JSON and write it to the local cache
            with open('IpProxy.text', 'w') as textIpProxy:
                textIpProxy.write(json.dumps(response, indent=2, ensure_ascii=False))
        with open('IpProxy.text') as textIpProxy:
            self.__ipList = json.loads(textIpProxy.read())
"""
读取URL
"""
def readWeb(self, url):
head = self.mkHead()
if self.__isProxy:
proxy = self.getIp()
response = requests.get(url=url, headers=head, proxies=proxy, timeout=(3, 7))
if response.status_code == 200:
return response.text
else:
response = requests.get(url=url, headers=head)
if response.status_code == 200:
return response.text
return None
"""
从http://www.goubanjia.com/获取代理IP
"""
def getIpListWeb(self):
print("正在获取WebIpProxy")
head = self.mkHead()
response = requests.get(url=self.__proxyUrl, headers=head)
if response.status_code != 200:
print("获取WebIpProxy失败")
return None
soup = BeautifulSoup(response.text, "html.parser")
trList = soup.select("tbody > tr")
ipList = []
for tr in trList:
ips = tr.contents[1].contents
tempIp = ''
for ip in ips:
if type(ip) == bs4.element.NavigableString:
tempIp = tempIp + ip.string.replace(' ', '')
continue
style = ip.attrs.get('style')
if style is not None:
style = style.replace(' ', '')
if (style is None or 'display:none' not in style) and len(ip.contents) != 0:
tempIp = tempIp + ip.contents[0].string.replace(' ', '')
http = tr.contents[5].next.next
ipList.append({http: http + "://" + tempIp})
return ipList
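To sanity-check the proxy utility on its own before wiring it into the crawler, a minimal usage sketch might look like this, assuming the class above is saved as IpProxyUtils.py (as the import in the next section implies); the target URL is the listing page used later in this article:
# Minimal sketch: exercise IpProxyUtils directly, with proxying disabled.
from IpProxyUtils import IpProxyUtils

utils = IpProxyUtils(None, False)   # default pool URL, do not route through a proxy
html = utils.readWeb("https://cd.ke.com/ershoufang/")
if html is not None:
    print(html[:200])               # first 200 characters of the fetched page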
Fetching the housing listing HTML
# -*- coding: UTF-8 -*-
import re
import sys
import pymysql
# Request/proxy utilities from the previous section
from IpProxyUtils import *
class CrawlerBk:
    # Proxy IP pool manager
    __ipProxyUtils = None
    # Database connection
    __myConnect = None
    # Cursor obtained via cursor() for executing SQL
    __myCursor = None
    # Maximum number of result pages to crawl per district
    __pageSize = 10
def __init__(self):
self.__ipProxyUtils = IpProxyUtils(None, False)
self.__myConnect = self.connectMysql()
self.__myCursor = self.__myConnect.cursor()
    def connectMysql(self):
        # Open the database connection (pymysql 1.x only accepts keyword arguments)
        try:
            db = pymysql.connect(host='localhost', user='root', password='root',
                                 database='testDb', charset='utf8')
        except BaseException:
            traceback.print_exc()
            print("Failed to open the database connection")
            raise
        return db
    # From the home page, collect each district's listing URL
    def openHtml(self, url):
        html = self.__ipProxyUtils.readWeb(url)
        soup = BeautifulSoup(html, "html.parser")
        # Links to every district's second-hand-housing page
        dqList = soup.select("div[data-role='ershoufang'] > div > a[class='CLICKDATA']")
        for dq in dqList:
            # Target fields per listing: district, community, title, unit price, total price,
            # floor, year built, layout, area, orientation, URL
            # District name and its relative URL
            dq1 = dq.string
            url1 = dq.get('href')
            # myThread(url + url1, dq1).start()
            for i in range(1, self.__pageSize + 1):
                fyList = self.openFy(url + url1.split('/')[2] + '/pg%d' % i + 'p1p2p3', dq1)
                self.insertData(fyList)
    # Scrape the listings on one result page
    def openFy(self, url, dq):
        fyListData = []
        html = self.__ipProxyUtils.readWeb(url)
        soup = BeautifulSoup(html, "html.parser")
        fyListSoup = soup.select(".sellListContent > .clear > div[class='info clear']")
        for fyInfo in fyListSoup:
            fyData = []
            # Title and URL of the listing
            fyData.append(fyInfo.select(".title > a")[0].string.strip().replace(' ', ''))
            fyData.append(fyInfo.select(".title > a")[0].get("href"))
            # House info string (community, layout, area, orientation, ...)
            houseInfo = fyInfo.select(".address > .houseInfo")[0].next_element.next_element.next
            fyData.append(houseInfo.strip().replace('\n', '').replace(' ', ''))
            jgInfo = fyInfo.select(".address > .priceInfo")[0]
            # Total price
            fyData.append(float(re.findall(r"\d+\.?\d*", jgInfo.select(".totalPrice > span")[0].string)[0]))
            # Unit price
            fyData.append(float(re.findall(r"\d+\.?\d*", jgInfo.select(".unitPrice > span")[0].string.strip().replace(' ', ''))[0]))
            fyData.append(dq)
            fyListData.append(tuple(fyData))
        return fyListData
    def insertData(self, fyListData):
        try:
            # Parameterized statement: pymysql handles quoting and escaping
            sql = "INSERT INTO fyxx(mc, url, info, zj, dj, dq) VALUES (%s, %s, %s, %s, %s, %s)"
            # self.__myCursor.executemany(sql, fyListData) would insert all rows in one call
            for fy in fyListData:
                self.__myCursor.execute(sql, fy)
            self.__myConnect.commit()
            print("Rows inserted successfully")
        except Exception:
            print("Failed to insert rows")
            traceback.print_exc()
def init(self, url):
self.openHtml(url)
if __name__ == "__main__":
crawlerBk = CrawlerBk()
crawlerBk.init("https://cd.ke.com/ershoufang/")
sys.exit(0)
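insertData assumes a fyxx table already exists in the testDb database. The article does not give its schema; a plausible definition matching the columns insertData writes (column names from the code, types and lengths are assumptions) can be created once with a sketch like this:
# Hypothetical schema for the fyxx table; column types and lengths are assumptions.
import pymysql

CREATE_FYXX = """
CREATE TABLE IF NOT EXISTS fyxx (
    id   INT AUTO_INCREMENT PRIMARY KEY,
    mc   VARCHAR(255),    -- listing title
    url  VARCHAR(512),    -- listing URL
    info VARCHAR(255),    -- house info (community, layout, area, orientation, ...)
    zj   DECIMAL(12, 5),  -- total price
    dj   DECIMAL(12, 5),  -- unit price
    dq   VARCHAR(64)      -- district
) DEFAULT CHARSET = utf8
"""

db = pymysql.connect(host='localhost', user='root', password='root',
                     database='testDb', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(CREATE_FYXX)
db.commit()
db.close()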