"""Python 爬虫练习(一)

参考: https://blog.csdn.net/qq_40558166/article/details/102868801
爬取百度产品大全页面的产品名称、链接及描述并存入 SQLite 数据库。
基本步骤: 爬取数据 -> 解析数据 -> 存储数据
"""
#主函数
def main():
#爬取的页面
baseurl = "https://www.baidu.com/more/"
# 爬取数据并解析数据
datalist = getData(baseurl)
# 存储数据
dbpath = "bdproduct.db"
saveData(dbpath,datalist)
# 获取网页html内容
def askURL(url):
head = { "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 94.0.4606.81 Safari / 537.36 Edg / 94.0.992.50"
} #伪装身份,用户代理表示告诉服务器我们是什么类型的机器
request = urllib.request.Request(url,headers=head)
html=""
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8") #注意解码
# print(html)
except urllib.error.URLError as e:
if hasattr(e,"code"):
print(e.code)
if hasattr(e,"reason"):
print(e.reason)
return html
def getData(baseurl):
datalist = []
count = 1
#1.爬取网页内容
html = askURL(baseurl)
soup = BeautifulSoup(html,"html.parser")
#2.解析数据
con = soup.find_all("div",class_='con') #根据网页源码,所需内容是类名为con的div
for item in con:
data=[]
proName = item.select('a[target="_blank"]')[1].get_text()
# print(proName)
data.append(proName)
proUrl = item.find('a').get("href")
# print(proUrl)
data.append(proUrl)
proAbout = item.find('span').get_text()
# print(proAbout)
data.append(proAbout)
datalist.append(data)
print("第{}个产品:{}".format(count,data))
count = count+1
# print(datalist)
print("数据爬取成功")
return datalist
#初始化数据库 重复操作会报错
def init_db(dbpath):
sql='''
create table bd_pro(
id integer primary key autoincrement,
proName text,
proUrl text,
proAbout text
)
'''
conn = sqlite3.connect(dbpath)
cursor = conn.cursor()
cursor.execute(sql)
conn.commit()
conn.close()
print("成功建表")
def saveData(dbpath,datalist):
print("Save begin.")
init_db(dbpath)
#存入数据库
conn = sqlite3.connect(dbpath)
cur = conn.cursor()
for data in datalist:
sql = '''insert into bd_pro
(proName,proUrl,proAbout)
values('{}','{}','{}')'''.format(data[0],data[1],data[2])
# values这里的占位符没太搞懂,不知道该什么时候加引号,乱试出来的[o(╥﹏╥)o]
cur.execute(sql)
conn.commit()
cur.close()
conn.close()
print("Save finished.")