1.爬虫介绍
2.准备工作
3.构建流程
# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py
# @Software: PyCharm
import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Entry point: crawl the Top250 pages, then save the results to an .xls file."""
    print("这是main")
    base = "http://movie.douban.com/top250?start="
    # Step 1: crawl the pages.
    rows = getData(base)
    # Step 3: save the data (path of the target spreadsheet).
    savaData(".\\豆瓣电影Top250.xls")
#爬取数据
def getData(baseurl):
    """Placeholder crawler: returns an empty result list for now."""
    return []
def savaData(savapath):
    """Placeholder saver: emits a blank line and does nothing else yet."""
    print("")
# Run the demo only when executed as a script, not on import.
if __name__=="__main__":
    main()
4.urllib
getData&askURL
# coding:utf-8
# @Time : 22/5/1 15:00
# @Author : Justha
# @File : testurllib.py
# @Software: PyCharm
# 想放弃了就去给我做运动!
import urllib.request
# Demo: fetch the Douban homepage while pretending to be a desktop Chrome
# browser (Douban rejects the default urllib User-Agent), then dump the HTML.
url = "http://www.douban.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36",
}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
5.获取数据
# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py
# @Software: PyCharm
import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Entry point: download the 10 Top250 pages, then save the results."""
    print("这是main")
    start_url = "http://movie.douban.com/top250?start="
    # Step 1: crawl every page.
    pages = getData(start_url)
    # Step 3: save the data (the pages are not passed yet at this stage).
    savaData(".\\豆瓣电影Top250.xls")
#爬取数据
def getData(baseurl):
    """Download all 10 Top250 pages (25 movies each) and return their raw HTML."""
    pages = []
    for page_index in range(10):
        page_url = baseurl + str(page_index * 25)
        print(page_url)
        pages.append(askURL(page_url))
    # Debug aid: dump the HTML of the last page fetched.
    print(pages[9])
    return pages
def savaData(savapath):
    """Placeholder saver: emits a blank line and does nothing else yet."""
    print("")
# 获取网页内容
def askURL(url):
    """Fetch *url* with a browser-like User-Agent.

    Returns the response body decoded as UTF-8, or an empty string when the
    request fails (the HTTP status code / failure reason is printed).
    """
    head = {
        # Douban rejects the default urllib User-Agent, so spoof Chrome.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    # Bug fix: `html` used to be unbound when urlopen raised, turning a plain
    # network failure into an UnboundLocalError at `return html`.
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the demo only when executed as a script, not on import.
if __name__=="__main__":
    main()
6.beautifulsoup
# coding:utf-8
# @Time : 22/5/1 16:04
# @Author : Justha
# @File : testBs4.py
# @Software: PyCharm
# 想放弃了就去给我做运动!
import re
from bs4 import BeautifulSoup
# Parse the locally saved Baidu homepage with the stdlib HTML parser.
# Bug fix: the original opened the file and never closed it; a context
# manager guarantees the handle is released.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")
# # 1.Tag 为获取到的第一个标签及其内容
# print(type(bs))
# print(type(bs.meta)) #类型为Tag
# print(bs.meta)
#
# # 2.NavigableString 标签里的内容(字符串)
# print(bs.a.string)
# print(bs.a.attrs) #拿到标签的属性
#
# # 3.BeautifulSoup 整个文件
# # print(bs)
#
# # 4.注释(会将string里的注释去掉,拿到内容)
# print(bs.a.string) #这里输出为新闻1,而不是<!--新闻1-->
#
# # 文档遍历
# print(bs.head.contents)
# print(bs.head.contents[1])
# 文档搜索
# # 1)find_all findAll???
# # 字符串过滤,只会查找完全一致的内容
# a_list=bs.find_all("a",limit=3) #标签 limit限制数量
# print(a_list)
# a_list=bs.find_all(id="head") #id
# print(a_list)
# a_list=bs.find_all(class_=True) #class
# print(a_list)
# a_list=bs.find_all(text=re.compile("\d")) #标签里所有包含数字的内容
# print(a_list)
# # 2)正则表达式
# a_list2=bs.find_all(re.compile("a"))
# print(a_list2)
# 3)select
# list3=bs.select("a") #标签
# list3=bs.select(".mnav") #类
# list3=bs.select("#head") #id
# list3=bs.select("a[class='bri']") # a.bri
# list3=bs.select("head>title") # head title
# General-sibling selector: every .bri element preceded by a .mnav sibling.
# Bug fix: the selector contained a stray "']" copied from the attribute
# selector example above ("a[class='bri']"), which is invalid CSS syntax.
list3 = bs.select(".mnav ~ .bri")
for item in list3:
    print(item)
7.正则表达式
至保存sql前代码
# coding:utf-8
# @Time : 22/4/29 11:20
# @Author : Justha
# @File : spider.py
# @Software: PyCharm
import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Entry point: scrape the Douban movie Top250 and save it to an .xls file."""
    print("这是main")
    start_url = "http://movie.douban.com/top250?start="
    # Step 1: crawl and parse every page.
    rows = getData(start_url)
    # Step 3: persist the parsed rows (the .xls path is fixed inside savaData).
    savaData(rows)
#爬取数据
def getData(baseurl):
    """Crawl all 10 Top250 pages and extract one row per movie.

    Each row is [title, score, comment count, quote, detail link, poster URL].
    Returns the list of all rows (up to 250).
    """
    datalist = []
    # Regexes for the fields inside one <div class="item"> block.
    # `?` keeps matches non-greedy; re.S lets `.` span newlines.
    findTitle = re.compile(r'<img.*alt="(.*?)"')
    findLink = re.compile(r'<a href="(.*?)">')
    findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
    findScore = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    findCommentNum = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*?)</span>')
    findBd = re.compile(r'<div class="bd">.*<p class="">(.*?)</p>', re.S)
    for i in range(0, 10):  # 10 pages * 25 movies = Top 250
        url = baseurl + str(i * 25)
        print(url)
        html = askURL(url)
        soup = bs4.BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            # Pull each field out of the item's HTML.
            title = re.findall(findTitle, item)[0]
            score = re.findall(findScore, item)[0]
            commentNum = re.findall(findCommentNum, item)[0]
            # Bug fix: some movies have no one-line quote. Without the
            # fallback, `inq` was unbound on the first such movie (NameError)
            # or silently kept the previous movie's quote.
            try:
                inq = re.findall(findInq, item)[0]
            except IndexError as e:
                print(e)
                inq = ""
            bd = re.findall(findBd, item)[0]
            link = re.findall(findLink, item)[0]
            imgSrc = re.findall(findImgSrc, item)[0]
            # Debug output of the parsed fields.
            print("%s %s %s人评价 %s\n%s\n%s\n%s" % (title, score, commentNum, inq, link, imgSrc, bd))
            # Assemble the row in spreadsheet column order.
            data.append(title)
            data.append(score)
            data.append(commentNum)
            data.append(inq)
            data.append(link)
            data.append(imgSrc)
            datalist.append(data)
    return datalist
def savaData(datalist):
    """Write the scraped rows into 豆瓣Top250.xls, one movie per row.

    Row 0 holds the column headings; each entry of *datalist* is a
    6-element row [title, score, comment count, quote, link, poster URL].
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('sheet666', cell_overwrite_ok=True)
    title = ('电影名', '分数', '评论人数', '描述', '电影链接', '电影图片链接')
    for i in range(len(title)):
        worksheet.write(0, i, title[i])
    # Bug fix: iterate over the actual data instead of a hard-coded 250 rows,
    # which raised IndexError whenever fewer movies were scraped.
    for row_index, row in enumerate(datalist, start=1):
        for col_index in range(6):
            worksheet.write(row_index, col_index, row[col_index])
    workbook.save('豆瓣Top250.xls')
# 获取网页内容
def askURL(url):
    """Fetch *url* with a browser-like User-Agent.

    Returns the response body decoded as UTF-8, or an empty string when the
    request fails (the HTTP status code / failure reason is printed).
    """
    head = {
        # Douban rejects the default urllib User-Agent, so spoof Chrome.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    # Bug fix: `html` used to be unbound when urlopen raised, turning a plain
    # network failure into an UnboundLocalError at `return html`.
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the scraper only when executed as a script, not on import.
if __name__=="__main__":
    main()
    # "爬取完毕" = scraping finished.
    print("爬取完毕!")