from bs4 import BeautifulSoup
import pymysql
import sqlite3
import sys
import importlib
importlib.reload(sys)
# Static snapshot of the region navigation markup that the script parses.
# It holds four <ul class="nav_child"> lists; every <a class="child_link">
# anchor has the region name as its text and the forum link in `href`.
# Most hrefs are site-relative ("/list-<id>-1.shtml"); the itemid="hn" entry
# is the exception and carries an absolute URL on bbs.hainan.net.
html = """
<ul class="nav_child"><li><a itemid="39" class="child_link" href="/list-39-1.shtml">北京</a></li>
<li><a itemid="59" class="child_link" href="/list-59-1.shtml">天津</a></li>
<li><a itemid="77" class="child_link" href="/list-77-1.shtml">河北</a></li>
<li><a itemid="80" class="child_link" href="/list-80-1.shtml">河南</a></li>
<li><a itemid="42" class="child_link" href="/list-42-1.shtml">山东</a></li>
<li><a itemid="88" class="child_link" href="/list-88-1.shtml">山西</a></li>
<li><a itemid="97" class="child_link" href="/list-97-1.shtml">内蒙古</a></li>
<li><a itemid="58" class="child_link" href="/list-58-1.shtml">辽宁</a></li>
<li><a itemid="52" class="child_link" href="/list-52-1.shtml">吉林</a></li>
<li><a itemid="82" class="child_link" href="/list-82-1.shtml">黑龙江</a></li>
</ul>
<ul class="nav_child"><li class="child_hd"></li>
<li><a itemid="41" class="child_link" href="/list-41-1.shtml">上海</a></li>
<li><a itemid="65" class="child_link" href="/list-65-1.shtml">江苏</a></li>
<li><a itemid="61" class="child_link" href="/list-61-1.shtml">浙江</a></li>
<li><a itemid="78" class="child_link" href="/list-78-1.shtml">安徽</a></li>
<li><a itemid="90" class="child_link" href="/list-90-1.shtml">江西</a></li>
</ul>
<ul class="nav_child"><li class="child_hd"></li>
<li><a itemid="44" class="child_link" href="/list-44-1.shtml">广东</a></li>
<li><a itemid="79" class="child_link" href="/list-79-1.shtml">广西</a></li>
<li><a itemid="56" class="child_link" href="/list-56-1.shtml">湖南</a></li>
<li><a itemid="46" class="child_link" href="/list-46-1.shtml">湖北</a></li>
<li><a itemid="92" class="child_link" href="/list-92-1.shtml">福建</a></li>
<li><a itemid="hn" class="child_link" href="http://bbs.hainan.net/list-hn-1.shtml">海南</a></li>
</ul>
<ul class="nav_child"><li class="child_hd"></li>
<li><a itemid="45" class="child_link" href="/list-45-1.shtml">重庆</a></li>
<li><a itemid="63" class="child_link" href="/list-63-1.shtml">四川</a></li>
<li><a itemid="178" class="child_link" href="/list-178-1.shtml">贵州</a></li>
<li><a itemid="62" class="child_link" href="/list-62-1.shtml">云南</a></li>
<li><a itemid="153" class="child_link" href="/list-153-1.shtml">西藏</a></li>
<li><a itemid="183" class="child_link" href="/list-183-1.shtml">甘肃</a></li>
<li><a itemid="60" class="child_link" href="/list-60-1.shtml">陕西</a></li>
<li><a itemid="191" class="child_link" href="/list-191-1.shtml">宁夏</a></li>
<li><a itemid="203" class="child_link" href="/list-203-1.shtml">青海</a></li>
<li><a itemid="173" class="child_link" href="/list-173-1.shtml">新疆</a></li>
</ul>
"""
# Prefix used to turn site-relative forum links into absolute URLs.
BASE_URL = 'http://bbs.tianya.cn'


def _absolute_url(href, base=BASE_URL):
    """Return *href* as an absolute URL.

    Site-relative paths (e.g. ``/list-39-1.shtml``) are prefixed with *base*.
    Links that already carry an http(s) scheme are returned unchanged, so an
    absolute href is never glued onto *base*.
    """
    if href.startswith(('http://', 'https://')):
        return href
    return base + href


# Collect every region link from the navigation markup and store one row per
# link (region name, absolute forum URL) in the `citys` table.
soup = BeautifulSoup(html, 'html.parser')
# href=True skips anchors without a target instead of failing on link['href'].
links = soup.find_all('a', href=True)
# Keyword arguments: PyMySQL 1.0+ no longer accepts positional connect() args.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration or environment variables.
conn = pymysql.connect(host='localhost', user='zoe', password='1235789y',
                       database='tianyadb', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Placeholders are filled in by the driver, which escapes the values.
        sql = "insert into citys(city,c_url) values(%s,%s)"
        for link in links:
            try:
                print('there')
                print(sql)
                # Pass plain str values; the driver encodes them using the
                # connection charset. get_text() also copes with anchors that
                # contain nested markup, where `.string` would be None.
                cursor.execute(
                    sql,
                    (link.get_text(strip=True), _absolute_url(link['href'])),
                )
                print('ok')
                # Commit per row so one failing insert does not undo the others.
                conn.commit()
            except pymysql.Error as e:
                print('************error:', e)
                conn.rollback()
finally:
    # Always release the connection, even if an unexpected error escapes.
    conn.close()
以上为源代码
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
其中遇到的问题
1:中文编码问题
分为4步:1.Python文件设置编码(文件前面加上 #encoding=utf-8)
2.设置数据库编码为utf-8(MySQL中的字符集名称写作utf8,不带连字符,例如建库/建表时指定 charset=utf8)
3.Python连接mysql数据库时加上参数charset='utf8'
4.设置python的默认编码为utf8(仅适用于Python2:先reload(sys),再调用sys.setdefaultencoding('utf-8'))
注:Python3中已取消sys.setdefaultencoding,str本身就是Unicode,默认编码即为UTF-8,因此这一步不再需要;import importlib、importlib.reload(sys)只是重新加载sys模块,并不会改变编码设置。
2.insert语句变量插入
在sql语句中将占位符标好,然后在cursor.execute()中将变量作为第二个参数传入,由驱动负责转义。(这种方式安全;如果改用Python字符串格式化,即在sql字符串后面用%直接拼接变量,则有sql注入的危险)
sql = "insert into citys(city,c_url) values(%s,%s)"
print(sql)
cursor.execute(sql,(str(l.string),str('http://bbs.tianya.cn'+l['href']).encode('utf8')))
3.错误异常处理
4.游标指针cursor的位置问题