豆瓣爬取
导包
pip install beautifulsoup4 xlwt(其余用到的模块 re、urllib、sqlite3 均为 Python 标准库,无需安装)
from bs4 import BeautifulSoup # decode,get data
import re # regular expression
import urllib.request, urllib.error # request web
import xlwt # excel
import sqlite3 # SQLite3
urllib测试
一个测试网站:http://httpbin.org/
有坑:
所有字符串当时都改用了双引号,否则会报错;不过只有第一次运行时报错,运行成功后再改回单引号也没有再触发。(注:Python 中单引号与双引号完全等价,这个报错很可能另有原因,例如网络或缓存问题。)
get
import urllib.request
import urllib.parse
# GET request: download the Baidu home page and save it as baidu.html.
# Using the response as a context manager closes the connection when done.
with urllib.request.urlopen("http://www.baidu.com") as response:
    page = response.read().decode("utf-8")

# Write with an explicit encoding: the platform default encoding may be
# unable to represent characters that were decoded from UTF-8.
with open("baidu.html", "w", encoding="utf-8") as file:
    file.write(page)
post
# POST request against httpbin.org with a 2-second timeout.
posturl = "http://httpbin.org/post"
# urlencode() needs the form fields as its argument (calling it with no
# argument raises TypeError); the result is converted to bytes because the
# request body passed to urlopen() must be bytes.
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
try:
    with urllib.request.urlopen(posturl, data, timeout=2) as response:
        print(response.read().decode("utf-8"))
except urllib.error.URLError:
    print("time out")
BeautifulSoup
基础
from bs4 import BeautifulSoup
# Parse the saved page. Reading inside a ``with`` block closes the file
# handle as soon as its bytes are in memory.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")

# 1. Tag: attribute access (bs.title, bs.a) returns the first tag of that name.
print(bs.title)
print(type(bs.title))

# 2. NavigableString: the text inside a tag; ``attrs`` holds the tag's attributes.
print(bs.title.string)
print(type(bs.title.string))
print(bs.a.attrs)
print(type(bs.a.attrs))
print("-" * 30)

# 3. BeautifulSoup: the object that represents the whole document.
print(type(bs))

# 4. Comment: prints the type of the first <a> tag's string; it is a Comment
#    only when that tag's content is an HTML comment.
print(type(bs.a.string))
文档遍历
# Document traversal: print each entry of the <head> tag's ``contents`` list.
for child in bs.head.contents:
    print(child)
文档搜索
# 1. String filter: find every <a> tag.
print(bs.find_all("a"))

# 2. Regular-expression filter: find every tag whose name matches the pattern "a".
name_contains_a = re.compile("a")
print(bs.find_all(name_contains_a))


# 3. Function filter: find_all() is given a function that is called with a tag.
def name_is_exists(tag):
    """Return whether *tag* has a ``name`` attribute."""
    return tag.has_attr("name")


print(bs.find_all(name_is_exists))

# 4. Keyword-argument filter: match on an attribute value (here id="head").
for match in bs.find_all(id="head"):
    print(match)

# 5. text argument: match on string content.
print(bs.find_all(text="hao123"))

# 6. CSS selectors.
print(bs.select("#head .link"))
print(bs.select("head > title"))
正则表达式
import re
# search() scans the string and reports only the first match.
pat = re.compile("AA")
m = pat.search("CAABAAAS")
print(m)

# findall() returns every non-overlapping match as a list.
lowercase = re.compile("[a-z]")
print(lowercase.findall("absBcDdsa"))

# sub() replaces each "a" that is immediately followed by a newline with "A".
a_before_newline = re.compile(r"a\n")
print(a_before_newline.sub("A", "a\nbcdabc"))
保存到Excel
# Author:RaoGang
# -*- coding = utf-8 -*-
# @Time:2021/4/30 14:23
# @File: testXwlt.py
# @Software: PyCharm
import xlwt
# Write a triangular multiplication table into student.xls:
# row i holds the products j * i for every j from 1 to i.
workbook = xlwt.Workbook(encoding="utf-8")
worksheet = workbook.add_sheet("sheet1")
for row in range(1, 10):
    for col in range(1, row + 1):
        # Arguments are (row index, column index, value); indexing starts at
        # 1 here, so row 0 and column 0 are never written.
        worksheet.write(row, col, "%d * %d = %d" % (col, row, row * col))
workbook.save("student.xls")
保存到数据库
创建表
import sqlite3
# Create the ``company`` table in test.db.
# ``if not exists`` keeps the script re-runnable: without it a second run
# fails because the table is already there.
sql = '''
    create table if not exists company
        ( id int primary key not null,
        name text not null,
        age int not null);
'''
conn = sqlite3.connect("test.db")
try:
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
finally:
    # Close the connection even when the statement fails.
    conn.close()
操作表
# Author:RaoGang
# -*- coding = utf-8 -*-
# @Time:2021/4/30 14:50
# @File: testsqlite3.py
# @Software: PyCharm
import sqlite3
# Update one row of the ``company`` table in test.db, then print every row.

# Sample insert statement; it is defined for reference and not executed below.
sql_insert = '''
    insert into company (id,name,age)
    values (2,'lihua',21);
'''
sql_query = '''
    select * from company
'''
# The string value is single-quoted: in SQL, double quotes denote identifiers,
# and SQLite accepts a double-quoted string only as a legacy fallback.
sql_update = '''
    update company
    set name = 'lucy',age=22
    where id = 2
'''

conn = sqlite3.connect("test.db")
try:
    cursor = conn.cursor()
    cursor.execute(sql_update)
    for row in cursor.execute(sql_query):
        print(row)
    conn.commit()
finally:
    # Close the connection even when a statement fails.
    conn.close()
完整代码
# Author:RaoGang
# -*- coding = utf-8 -*-
# @Time:2021/4/29 16:37
# @File: main.py
# @Software: PyCharm
from bs4 import BeautifulSoup # decode,get data
import re # regular expression
import urllib.request, urllib.error # request web
import xlwt # excel
import sqlite3 # SQLite3
def main():
    """Scrape the Douban top-250 movie list and save it to Excel and SQLite."""
    # Local import so this function does not depend on a file-level change.
    import os

    baseurl = "https://movie.douban.com/top250?start="
    # os.path.join gives ".\\dbTop250.xls" on Windows (unchanged) and
    # "./dbTop250.xls" on POSIX, where a hard-coded backslash would become a
    # literal character of the file name.
    savePath = os.path.join(".", "dbTop250.xls")
    dbPath = os.path.join(".", "dbTop250.db")
    print("hello")
    # 1. download the pages and 2. parse each movie entry
    datas = getData(baseurl=baseurl)
    # 3. save the records to the workbook and to the database
    saveData(datas, savePath)
    saveToDB(datas, dbPath)
# Compiled patterns applied to the HTML of a single movie entry.
# Every capture is non-greedy so it stops at the first closing delimiter,
# even when several tags share one line.

# movie detail link
findlink = re.compile(r'<a href="(.*?)">')
# movie picture link: the src attribute inside the <img> tag itself
# (a greedy ".*" with re.S would run on to the last src=" in the entry)
findimgsrc = re.compile(r'<img[^>]*?\ssrc="(.*?)"', re.S)
# movie title (one match per title span)
findtitle = re.compile(r'<span class="title">(.*?)</span>')
# rating
findrating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# number of people who rated the movie
findjudge = re.compile(r'<span>(\d*)人评价</span>')
# one-line summary
findinq = re.compile(r'<span class="inq">(.*?)</span>')
# related information paragraph; re.S lets the capture span several lines
findrelated = re.compile(r'<p class="">(.*?)</p>', re.S)
def getData(baseurl, pages=10, page_size=25):
    """Download the list pages and extract one record per movie.

    Args:
        baseurl: URL prefix; each page's start offset is appended to it.
        pages: number of list pages to fetch (default 10).
        page_size: number of movies per page, used to compute the offset
            (default 25).

    Returns:
        A list of 8-item lists in the order: detail link, picture link,
        Chinese title, foreign title, rating, rating count, summary,
        related info. A field that cannot be found is stored as " "
        (related info becomes "" after stripping).
    """
    datalist = []
    for page in range(pages):
        html = getSpecialUrl(baseurl + str(page * page_size))
        if not html:
            # Nothing was downloaded for this page; skip it instead of
            # handing an empty value to the parser.
            continue
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            datalist.append(_parse_item(str(item)))
    return datalist


def _first_match(pattern, text, default=" "):
    """Return the first match of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def _parse_item(item):
    """Extract the eight fields of one movie from the HTML of its entry."""
    titles = re.findall(findtitle, item)
    ctitle = titles[0] if titles else " "
    # Only when exactly two titles are found is the second one stored as
    # the foreign name, with its "/" separator removed.
    otitle = titles[1].replace("/", "") if len(titles) == 2 else " "

    related = _first_match(findrelated, item)
    related = re.sub(r"<br(\s+)?/>(\s+)?", " ", related)  # remove <br/>
    related = re.sub("/", " ", related)  # remove /

    return [
        _first_match(findlink, item),
        _first_match(findimgsrc, item),
        ctitle,
        otitle,
        _first_match(findrating, item),
        _first_match(findjudge, item),
        _first_match(findinq, item).replace("。", ""),
        related.strip(),
    ]
def getSpecialUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header (presumably so
    the site does not reject the default Python client; confirm if needed).

    Args:
        url: address of the page to download.

    Returns:
        The page text, or "" when the request fails with URLError; in that
        case the error's code and/or reason are printed.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.49"
    }
    request = urllib.request.Request(url=url, headers=head)
    try:
        # The context manager closes the response once it has been read.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    # Always return a string so callers never receive None.
    return ""
def saveData(datalist, savepath):
    """Write the movie records to an Excel workbook saved at *savepath*.

    Args:
        datalist: sequence of records; each record is a sequence of cell
            values written left to right.
        savepath: file name passed to the workbook's save().
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("sheet1", cell_overwrite_ok=True)
    headers = ("详情链接", "图片链接", "电影中文名", "电影外国名", "评分", "评价数", "概况", "相关信息")
    # Row 0 is the header row.
    for column, title in enumerate(headers):
        sheet.write(0, column, title)
    # Records start at row 1, directly below the header.
    for row, record in enumerate(datalist, start=1):
        for column, value in enumerate(record):
            sheet.write(row, column, value)
    book.save(savepath)
def initDB(dbPath):
    """Create the ``movie250`` table in the database at *dbPath* if missing.

    Args:
        dbPath: path of the SQLite database file.

    Raises:
        sqlite3.Error: if the table cannot be created.
    """
    # ``if not exists`` makes repeated calls harmless, so no exception has
    # to be swallowed to cope with an already-initialised database.
    sql = '''
        create table if not exists movie250
        (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar ,
        oname varchar ,
        rating numeric ,
        ratecount numeric ,
        introduce text,
        info text
        );
    '''
    conn = sqlite3.connect(dbPath)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        # Report success only after the statement has actually succeeded.
        print("数据库建立成功!")
    finally:
        conn.close()
def saveToDB(datas, dbPath):
    """Insert the movie records into the ``movie250`` table at *dbPath*.

    Args:
        datas: iterable of 8-item records in the column order info_link,
            pic_link, cname, oname, rating, ratecount, introduce, info.
            The records are not modified.
        dbPath: path of the SQLite database file; initDB is called on it
            before inserting.

    Raises:
        sqlite3.Error: if an insert fails; nothing is committed in that case.
    """
    initDB(dbPath)
    # Placeholders let sqlite3 bind the values, so quotes inside a title or
    # summary cannot break or alter the statement.
    sql = (
        "insert into movie250 "
        "(info_link,pic_link,cname,oname,rating,ratecount,introduce,info) "
        "values (?,?,?,?,?,?,?,?)"
    )
    conn = sqlite3.connect(dbPath)
    try:
        cursor = conn.cursor()
        cursor.executemany(sql, datas)
        conn.commit()
    finally:
        # Close the connection even when an insert fails.
        conn.close()
    print("finish")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()