1. 首先进行 PyCharm 与 SQLite 数据库的连接,并创建数据库:
import sqlite3

# Create the SQLite database file (test.db appears in the working directory
# on the first run) and the "homes" table that the scraper below fills.
# Column notes: info_link is declared but not populated by the scraper;
# size/num/money get numeric affinity (area, floor count, monthly rent);
# subway/decoration/key/new hold the listing's tag texts.
conn = sqlite3.connect("test.db")
try:
    c = conn.cursor()
    # "if not exists" makes the script safe to re-run; the original plain
    # CREATE TABLE raised "table homes already exists" on a second run.
    sql = '''
    create table if not exists homes
    (
    id integer primary key autoincrement,
    info_link text,
    place text,
    xiaoqu text,
    size numeric,
    chaoxiang varchar,
    huxing text,
    type text,
    num numeric,
    subway text,
    decoration text,
    key text,
    new text,
    time text,
    money numeric
    )
    '''
    c.execute(sql)
    conn.commit()
finally:
    # Close the connection even if table creation fails.
    conn.close()
print("ok")
执行后,同目录下会出现 test.db 数据库文件;连接成功后数据库表结构如下图所示。
2.爬虫代码:
import re
from bs4 import BeautifulSoup
import urllib.request,urllib.error
import sqlite3
#爬虫伪装函数
def askURL(url, timeout=None):
    """Fetch *url* and return the page body decoded as UTF-8.

    A desktop-browser User-Agent header is sent so the site does not
    reject the request as an obvious bot.  On URLError the HTTP status
    code and/or reason is printed and the empty string is returned, so
    callers always receive a str.

    :param url: absolute URL to fetch.
    :param timeout: optional socket timeout in seconds; None keeps the
        original blocking behavior (no timeout).
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # "with" closes the response even if read()/decode() raises,
        # fixing the socket leak in the original version.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# ---- Precompiled regular expressions applied to one listing's raw HTML ----
# The three place patterns match the same breadcrumb of three <a> links and
# differ only in which link the capture group sits on.
# District (first <a>)
findplace1 = re.compile(
    r'<a href="/zufang/.*" target="_blank">(.*)</a>-<a href="/zufang/.* target="_blank">.*</a>-<a href="/zufang.*" target="_blank" title=".*">.*</a>')
# Sub-district (second <a>)
findplace2 = re.compile(
    r'<a href="/zufang/.*" target="_blank">.*</a>-<a href="/zufang/.* target="_blank">(.*)</a>-<a href="/zufang.*" target="_blank" title=".*">.*</a>')
# Residential compound name (third <a>)
findplace3 = re.compile(
    r'<a href="/zufang/.*" target="_blank">.*</a>-<a href="/zufang/.* target="_blank">.*</a>-<a href="/zufang.*" target="_blank" title=".*">(.*)</a>')
# Floor area
finddaxiao = re.compile(r'<i>/</i>(.*)<i>/</i>.*<i>/</i>.*<span class="hide">',re.S)  # re.S lets "." match newline characters too
# Orientation
findfangxiang = re.compile(r'<i>/</i>.*<i>/</i>(.*)<i>/</i>.*<span class="hide">',re.S)
# Room layout
findguige = re.compile(r'<i>/</i>.*<i>/</i>.*<i>/</i>(.*)<span class="hide">',re.S)
# Floor-level text (category + floor count)
findleixing = re.compile(r'<p class="content__list--item--des">.*<i>/</i>(.*)</span>.*</p>.*<p class="content__list--item--bottom oneline">',re.S)
# Near-subway tag (optional in the markup)
findsubway = re.compile(r'<i class="content__item__tag--is_subway_house">(.*)</i>')
# Renovated/refined-decoration tag (optional)
finddecoration = re.compile(r'<i class="content__item__tag--decoration">(.*)</i>')
# Viewable-anytime tag (optional)
findkey = re.compile(r'<i class="content__item__tag--is_key">(.*)</i>')
# Newly-listed tag (optional)
findnew = re.compile(r'<i class="content__item__tag--is_new">(.*)</i>')
# Last-maintained time text
findtime = re.compile(r'<span class="content__list--item--time oneline">(.*)</span>')
# Monthly rent
findmoney = re.compile(r'<span class="content__list--item-price"><em>(.*)</em>')
#解析数据
def getData(baseurl):  # fetch the pages and parse them one by one
    """Scrape the first 5 listing pages under *baseurl* into rows.

    Each returned row is a 13-element list in the column order used by
    saveData2DB: place, xiaoqu, size, chaoxiang, huxing, type, num,
    subway, decoration, key, new, time, money.

    NOTE(review): every indexed [0] below assumes its pattern matches
    every listing item; a page-layout change raises IndexError — confirm
    against the live markup.
    """
    datalist = []  # temporary storage for all parsed rows
    for i in range(0, 5):
        url = baseurl + str(i)  # page URL = base URL + page index
        html = askURL(url)  # raw page source ("" when the fetch failed)
        # Parse each page right after it is downloaded.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="content__list--item"):
            # print(item)  # debug: dump the full Lianjia item markup
            data = []
            item = str(item)
            place1 = re.findall(findplace1, item)[0]  # re extracts the capture group via the precompiled pattern
            place2 = re.findall(findplace2, item)[0]
            place3 = re.findall(findplace3, item)[0]
            place = place1 + '-' + place2  # "district-subdistrict"
            data.append(place)
            data.append(place3)
            daxiao = re.findall(finddaxiao, item)[0]
            daxiao = daxiao.strip()
            data.append(daxiao.replace("㎡", ""))  # store the area as a bare number string
            fangxiang = re.findall(findfangxiang, item)[0]
            data.append(fangxiang.replace(" ", ""))
            guige = re.findall(findguige, item)[0]
            data.append(guige.replace(" ", ""))
            leixing1 = re.findall(findleixing, item)[0]
            leixing2 = leixing1.strip()  # strip leading/trailing whitespace
            leixing3 = leixing2.replace(" ", "")  # remove the inner spaces
            data.append(leixing3[0:3])  # first 3 chars: floor-level category
            data.append(leixing3[4:8].replace("层)", ""))  # floor count, "层)" suffix dropped
            # The four tag fields are optional in the markup; fall back to
            # a fixed "not ..." text when the tag is absent.
            subway = re.findall(findsubway, item)
            if (len(subway)) != 0:
                subway = subway[0]
                data.append(subway)
            else:
                data.append("不靠近地铁")
            decoration = re.findall(finddecoration, item)
            if len(decoration) != 0:
                decoration = decoration[0]
                data.append(decoration)
            else:
                data.append("不是精装")
            key = re.findall(findkey, item)
            if len(key) != 0:
                key = key[0]
                data.append(key)
            else:
                data.append("不是随时看房")
            new = re.findall(findnew, item)
            if len(new) != 0:
                new = new[0]
                data.append(new)
            else:
                data.append("不是新上")
            time = re.findall(findtime, item)[0]
            data.append(time)
            money = re.findall(findmoney, item)[0]
            data.append(money)
            datalist.append(data)  # add this listing's row to the result
    return datalist
#存入数据库
def saveData2DB(datalist):
    """Persist scraped listing rows into the homes table of test.db.

    Each row of *datalist* must hold 13 values in column order:
    place, xiaoqu, size, chaoxiang, huxing, type, num, subway,
    decoration, key, new, time, money.

    Uses a parameterized INSERT instead of string-built SQL: the original
    wrapped each value in hand-inserted double quotes, which broke on any
    value containing a quote and was SQL-injection-prone.  SQLite's
    numeric affinity still converts the size/num/money strings to numbers.
    """
    sql = '''
        insert into homes
            (place, xiaoqu, size, chaoxiang, huxing, type, num,
             subway, decoration, key, new, time, money)
        values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''
    conn = sqlite3.connect("test.db")  # connect to the database
    try:
        cur = conn.cursor()
        cur.executemany(sql, datalist)  # one prepared statement for all rows
        conn.commit()  # commit the inserts
        cur.close()  # close the cursor
    finally:
        conn.close()  # close the connection even on failure
    print("save....")
#执行程序入口
# Script entry point: scrape the first listing pages and persist the rows.
if __name__ == "__main__":
    start_url = 'https://sh.lianjia.com/zufang/'
    rows = getData(start_url)
    # rows can be printed here for inspection before saving
    saveData2DB(rows)
结果:如图所示,存储成功,数据已存入 SQLite 数据库。