# 爬虫的基本模板 + POST请求 + GET请求 + 访问豆瓣的练习
# (Basic crawler template, POST/GET requests, and a Douban practice run.)
import urllib.request
'''
知道自己的浏览器的信息
var ua=navigator.userAgent;
console.log(ua)
'''
"""
url="https://www.baidu.com"
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "}
data1 = urllib.request.Request(url, headers=headers)
response=urllib.request.urlopen(data1)
print(response.read().decode('utf-8'))
"""
"""
import urllib.parse
url="http://httpbin.org/post"
data=bytes(urllib.parse.urlencode({"hello":"word"}),encoding="utf-8")
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "}
data1 = urllib.request.Request(url, headers=headers)
response=urllib.request.urlopen(data1,data=data)
print(response.read().decode('utf-8'))
"""
"""
try:
response=urllib.request.urlopen("http://httpbin.org/get",timeout=0.01)
print(response.read().decode('utf-8'))
except Exception as e:
print(e)
"""
"""
#headers 是一个键值对
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "
}
data1=urllib.request.Request("http://douban.com/",headers=headers)
response = urllib.request.urlopen(data1)
# print(response.status) #状态信息
# print(response.getheaders()) #获取全部信息
print(response.getheader("date")) #获取单个信息
"""
"""
标准测试
# url="http://httpbin.org/post"
headers={
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "
}
data=bytes(urllib.parse.urlencode({"name":"盖世凯"}),encoding="utf-8")
req=urllib.request.Request(url=url,data=data,headers=headers,method="POST")
response=urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
"""
# Fetch the Douban homepage over GET with a desktop-browser User-Agent
# header (Douban rejects the default urllib UA) and dump the raw HTML.
url = "https://www.douban.com"
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "
}
response = urllib.request.urlopen(urllib.request.Request(url=url, headers=headers))
print(response.read().decode("utf-8"))
# 用到的外部包 + 爬取数据 + 保存数据 + 解析数据(模板)
# (External packages used; fetch / save / parse data — project template.)
import bs4
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Template entry point: fetch the data, then persist it."""
    base_url = "http://www.baidu.com"
    movie_rows = getData(base_url)
    save_path = ".\\豆瓣电影Top250.xls"
    saveData(save_path)
def saveData(savepath):
    """Placeholder: persisting the scraped data is not implemented yet."""
    return None
def getData(baseurl):
    """Placeholder scraper: currently yields no rows for any URL."""
    return []
# Run the template pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# bs4包的基本使用(根据html,css关键字来进行爬取)
# (Basic use of bs4: extracting nodes via HTML/CSS selectors.)
"""
BeautifulSoup4 将复杂的HTML文档转换成一个树形结构, 每个节点都是python对象.
(BeautifulSoup4 parses an HTML document into a tree; every node is a Python
object, and all objects fall into four kinds:)
Tag
NavigableString
BeautifulSoup
Comment
"""
from bs4 import BeautifulSoup
import re

# Parse a locally saved page and pick nodes with a CSS selector.
# FIX: the original opened the file and never closed it; 'with' closes the
# handle deterministically even if parsing raises.
with open("daohangtiao.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")
# CSS general-sibling combinator: every .bri element preceded by a .mnav sibling.
t_list = bs.select(".mnav~.bri")
for item in t_list:
    print(item)
# re正则表达式的使用 (Using the re regular-expression module.)
import re
# xlwt表格的使用 (Using xlwt to write .xls spreadsheets.)
import xlwt

# Write the lower triangle of a 9x9 multiplication table to student.xls.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet("sheet1")
for row in range(9):
    # Only cells at or left of the diagonal (col <= row) are filled.
    for col in range(row + 1):
        worksheet.write(row, col, "%d * %d = %d" % (col + 1, row + 1, (row + 1) * (col + 1)))
workbook.save('student.xls')
# 开始爬虫 (The actual Douban Top250 crawler starts here.)
import urllib.request
import re
from bs4 import BeautifulSoup
import xlwt
def main():
    """Crawl all Douban Top250 pages, then write the rows to an .xls file."""
    base = "https://movie.douban.com/top250?start="
    rows = getData(base)
    target = "豆瓣电影Top250.xls"
    saveData(rows, target)
# Pre-compiled patterns, applied to the str() of each <div class="item"> node.
findLink=re.compile(r'<a href="(.*?)">')  # detail-page URL
# NOTE(review): the capture group (.*) is greedy and the tag must end exactly
# with "/> for this to match — confirm against the live page markup.
findImgSrc=re.compile(r'<img.*?src="(.*)"/>',re.S)
findTitle=re.compile(r'<span class="title">(.*)</span>')  # Chinese + foreign titles
findRating=re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
findJudge=re.compile(r'<span>(\d*)人评价</span>')  # number of raters
findInq=re.compile(r'<span class="inq">(.*)</span>')  # one-line quote
findBd=re.compile(r'<p class="">(.*?)</p>',re.S)  # misc info block (director/year/genre)
def saveData(datalist, savepath):
    """Write the scraped movie rows to an .xls workbook.

    datalist: list of 8-element rows as produced by getData().
    savepath: output .xls file path.
    """
    print("save~~~")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    col = ("电影详情连接","图片链接","影片中文名","影片外国名","评分","平分数","概况","相关消息")
    # Header row.
    for i, name in enumerate(col):
        sheet.write(0, i, name)
    # FIX: iterate the actual data instead of a hard-coded range(0, 250) —
    # the original raised IndexError whenever fewer than 250 rows were scraped.
    for row, data in enumerate(datalist, start=1):
        print("第%d行了" % row)
        for j in range(len(col)):
            sheet.write(row, j, data[j])
    book.save(savepath)
def getData(baseurl):
    """Scrape all 10 pages of Douban Top250 and parse every movie entry.

    baseurl: URL prefix; the 0-based item offset (page * 25) is appended.
    Returns a list of 8-element lists:
    [link, image src, Chinese title, foreign title, rating, vote count, quote, info]
    """
    datalist = []
    for page in range(10):  # 10 pages x 25 movies = 250 entries
        url = baseurl + str(page * 25)
        html = askURL(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all("div", class_="item"):
            data = []
            item = str(item)  # run the regexes over the node's raw HTML
            data.append(re.findall(findLink, item)[0])
            data.append(re.findall(findImgSrc, item)[0])
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                data.append(titles[0])                     # Chinese title
                data.append(titles[1].replace("/", ""))    # foreign title, "/" stripped
            else:
                data.append(titles[0])
                data.append('')                            # no foreign title
            data.append(re.findall(findRating, item)[0])
            # FIX: the original appended the whole findall() list; take the
            # first match (or "" when absent) so the cell holds a string.
            judge = re.findall(findJudge, item)
            data.append(judge[0] if judge else "")
            inq = re.findall(findInq, item)
            if inq:
                data.append(inq[0].replace("。", ""))      # drop trailing period
            else:
                data.append("")
            bd = re.findall(findBd, item)[0]
            # FIX: raw strings — '<br(\s+)?...' was an invalid escape sequence.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', '', bd)
            bd = re.sub(r'/', '', bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist
def askURL(url):
    """Fetch `url` with a desktop-browser User-Agent and return the page text.

    Raises urllib.error.URLError / HTTPError on network or HTTP failure.
    """
    head = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363 "
    }
    html = ""
    request = urllib.request.Request(url=url, headers=head)
    # FIX: context manager closes the HTTP response; the original leaked it.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    return html
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()