import urllib.request
import re
import sqlite3
class MySpider:
    """Scrape an HTML exchange-rate table and store its rows in SQLite.

    The page is expected to contain ``<table class="pj_table">`` whose first
    ``<tr>`` is a header and whose remaining rows carry at least six ``<td>``
    cells: a currency name, four numeric columns (stored as TSP, CSP, TBP,
    CBP) and a timestamp string.
    """

    # Literal markers delimiting the rate table inside the page.
    _TABLE_OPEN = re.compile(r'<table class="pj_table">')
    _TABLE_CLOSE = re.compile(r'</table>')
    # Only the first six cells of a row are used.
    _COLUMN_COUNT = 6

    def openDB(self, db_path="rates.db"):
        """Connect to the database and (re)create an empty ``rates`` table.

        Args:
            db_path: SQLite database location; defaults to ``rates.db`` in
                the current directory.
        """
        self.con = sqlite3.connect(db_path)
        self.cursor = self.con.cursor()
        statements = (
            # "if exists" makes the first run (no table yet) a non-error.
            "drop table if exists rates",
            "create table rates (Currency varchar(256) primary key,TSP float,CSP float, TBP float, CBP float,Time varchar(256))",
        )
        for statement in statements:
            try:
                self.cursor.execute(statement)
            except sqlite3.Error as err:
                # Best effort, as before, but no longer silent.
                print(err)

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insertDB(self, Currency, TSP, CSP, TBP, CBP, Time):
        """Insert one rate record; database errors are printed, not raised.

        ``Currency`` is the primary key, so inserting a duplicate currency
        is reported and skipped.
        """
        sql = "insert into rates (Currency,TSP,CSP,TBP,CBP,Time) values (?,?,?,?,?,?)"
        try:
            self.cursor.execute(sql, [Currency, TSP, CSP, TBP, CBP, Time])
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored record as a fixed-width table."""
        self.cursor.execute("select Currency,TSP,CSP,TBP,CBP,Time from rates")
        rows = self.cursor.fetchall()
        print("%-18s%-12s%-12s%-12s%-12s%-12s" % ("Currency", "TSP", "CSP", "TBP", "CBP", "Time"))
        for row in rows:
            print("%-18s%-12.2f%-12.2f%-12.2f%-12.2f%-12s" % (row[0], row[1], row[2], row[3], row[4], row[5]))

    def match(self, t, s):
        """Locate the first tag in ``s`` that starts with ``<`` + ``t``.

        Args:
            t: Tag prefix without the leading ``<`` (e.g. ``"td"``, ``"/td"``).
                It is inserted into a regular expression unescaped.
            s: Text to search.

        Returns:
            ``{"start": a, "end": b}`` where ``a`` is the index of ``<`` and
            ``b`` is the index just past the next ``>``; ``None`` when the
            tag prefix or its closing ``>`` is not found.
        """
        opening = re.search(r"<" + t, s)
        if not opening:
            return None
        start = opening.start()
        closing = re.search(r">", s[start:])
        if not closing:
            return None
        return {"start": start, "end": start + closing.end()}

    def _iter_elements(self, tag, html):
        """Yield the inner text of each successive ``<tag ...>...</tag>``."""
        while True:
            opening = self.match(tag, html)
            closing = self.match("/" + tag, html)
            if not (opening and closing):
                # No further complete element: stop scanning.
                return
            yield html[opening["end"]:closing["start"]]
            html = html[closing["end"]:]

    def _extract_table(self, html):
        """Return the markup inside the rate table, or ``None`` if absent."""
        opening = self._TABLE_OPEN.search(html)
        if not opening:
            return None
        html = html[opening.end():]
        closing = self._TABLE_CLOSE.search(html)
        if not closing:
            return None
        return html[:closing.start()]

    def spider(self, url):
        """Download ``url``, parse the rate table and insert each data row.

        Failures are printed rather than raised so that callers can still
        display and close the database. A row whose numeric cells cannot be
        parsed is skipped without aborting the remaining rows.
        """
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(url) as resp:
                html = resp.read().decode()
        except Exception as err:
            # Crawl boundary: report any network/decoding failure and stop.
            print(err)
            return

        table_html = self._extract_table(html)
        if table_html is None:
            print('table class="pj_table" not found in page')
            return

        for index, tr_html in enumerate(self._iter_elements("tr", table_html), start=1):
            if index < 2:
                # The first <tr> is the header row.
                continue
            cells = []
            for cell in self._iter_elements("td", tr_html):
                cells.append(cell.strip())
                if len(cells) == self._COLUMN_COUNT:
                    break
            if len(cells) != self._COLUMN_COUNT:
                continue
            try:
                rates = [float(value) for value in cells[1:5]]
            except ValueError as err:
                print(err)
                continue
            # The timestamp is the sixth cell, i.e. index 5 (not 6).
            self.insertDB(cells[0], rates[0], rates[1], rates[2], rates[3], cells[5])

    def process(self):
        """Run the whole pipeline: open DB, crawl, display, close."""
        self.openDB()
        try:
            self.spider("https://www.psbc.com/cn/common/bjfw/whpjcx/")
            self.show()
        finally:
            # Always commit and release the connection.
            self.closeDB()
# Main program: create a spider and run its full pipeline. process() opens the
# database, crawls the rate page, prints the stored rows and closes the database.
# NOTE: these statements are unguarded, so they also run whenever this module
# is imported, not only when it is executed as a script.
spider = MySpider()
spider.process()