Python学习笔记Day4
正则表达式 re
compile()
# compile(): build the pattern object once, then reuse it for searches.
pattern = re.compile("AA")      # the regex to look for
m = pattern.search("AABAA")     # first occurrence of "AA" inside "AABAA"
print(m)
search()
# search(): module-level helper, scans the string for the first match.
target = "aaA"
m = re.search("aa", target)
print(m)
findall()
# findall(): return every non-overlapping match as a list of strings.
all_lower = re.findall("a", "aaaAaa")         # each single lowercase "a"
print(all_lower)
caps_single = re.findall("[A-Z]", "aaaABaa")  # capitals, one at a time
print(caps_single)
caps_runs = re.findall("[A-Z]+", "aaaABaa")   # maximal runs of capitals
print(caps_runs)
sub() 更替
# sub(): substitution — replace every "a" with "A" and print the result.
replaced = re.sub("a", "A", "abcdsckd")
print(replaced)
# Tip: write regex pattern text as a raw string (prefix r) so backslashes
# are kept literally and escape sequences are no longer a concern.
a = r"\aabd-\'"
print(a)
Urllib
打开百度,打印返回的response
# Open baidu.com and print the decoded response body.
response = urllib.request.urlopen("http://www.baidu.com")
page_text = response.read().decode("utf-8")
print(page_text)
获取post请求
# POST request: urlencode the form fields, then encode them to bytes.
data = urllib.parse.urlencode({"hello": "word"}).encode("utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
获取get请求
# GET request with a 5-second timeout; failures are printed, not raised.
try:
    resp = urllib.request.urlopen("http://httpbin.org/get", timeout=5)
    print(resp.read().decode("utf-8"))
except Exception as err:
    print(err)
打印
# Inspect the response: status code, full header list, one header by name.
response = urllib.request.urlopen("http://baidu.com")
status_code = response.status
print(status_code)
all_headers = response.getheaders()
print(all_headers)
print(response.getheader("Server"))
访问豆瓣
# Visit douban.com: send form data via POST with a browser-like
# User-Agent header so the request is not rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}
form_bytes = urllib.parse.urlencode({"name": "hello"}).encode("utf-8")
req = urllib.request.Request(
    url="http://douban.com",
    data=form_bytes,
    headers=headers,
    method="POST",
)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
豆瓣前250部电影列表
from bs4 import BeautifulSoup #网页解析
import re #正则表达式
import urllib.request,urllib.error #制定url
import xlwt #进行excel
import sqlite3 #进行数据库操作
def main():
    """Entry point: scrape the Douban Top-250 movie list.

    Crawls the paginated list page by page, parses every movie entry,
    and (eventually) saves the result to an .xls workbook.
    """
    baseUrl = "https://movie.douban.com/top250?start="
    # NOTE(review): savePath is defined but never used — saveData() is
    # still an empty stub; wire it up once saving is implemented.
    savePath = ".\\豆瓣评分Top250.xls"
    # Crawl + parse all pages.
    getData(baseUrl)
# Fetch the page content of a single URL.
def askURL(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns an empty string on any failure (the error is printed), so
    callers always receive a str — previously a failed request fell off
    the end of the function and returned None, which crashed the
    BeautifulSoup parsing downstream.
    """
    # User-Agent header: tells the server what kind of client this is,
    # so the request looks like an ordinary browser visit.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    html = ""
    try:
        # Request construction is inside the try so malformed URLs
        # (ValueError: unknown url type) are handled the same way as
        # network errors.
        request = urllib.request.Request(url=url, headers=header)
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except Exception as result:
        print(result)
    return html
# Regex cheatsheet: "." matches any single character,
# "*" matches 0 or more repetitions,
# "?" matches 0 or 1 (and makes ".*?" non-greedy).
# Detail-page link of the movie.
findLink = re.compile(r'<a href="(.*?)">')
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S) # poster image URL; re.S lets "." also match newlines
# Movie title(s) — a Chinese title and possibly a foreign title.
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating score.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of people who rated the movie.
findCommand = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary (quote).
findInq = re.compile(r'<span class="inq">(.*)</span>')
# NOTE(review): "fingBd" is a typo for "findBd" — kept unchanged because
# getData() references it by this name. Matches the credits/metadata paragraph.
fingBd = re.compile(r'<p class="">(.*?)</p>',re.S)
def getData(baseUrl):
    """Crawl all 10 pages of the Top-250 list and parse each movie entry.

    Returns a list of rows, one per movie:
    [link, img_src, cn_title, other_title, rating, rating_count, quote, credits].
    """
    dataList = []
    for page in range(0, 10):       # 10 pages x 25 movies = 250 entries
        url = baseUrl + str(page * 25)
        html = askURL(url)
        if not html:
            # Download failed — skip this page instead of crashing the parser.
            continue
        # Parse the page and walk every movie card.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title plus foreign title (leading "/" removed).
                data.append(titles[0])
                data.append(titles[1].replace("/", ""))
            else:
                data.append(titles[0])
                data.append(' ')    # placeholder keeps the row width fixed
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            command = re.findall(findCommand, item)[0]
            data.append(command)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", ""))  # drop trailing full stop
            else:
                data.append(" ")    # not every movie has a quote
            bd = re.findall(fingBd, item)[0]
            # Raw strings (r'...') avoid invalid-escape warnings for \s.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # strip <br/> tags
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())  # trim surrounding whitespace
            dataList.append(data)
    print(dataList)
    return dataList
def saveData():
    """Persist the scraped data to an .xls workbook.

    TODO: not implemented yet — the xlwt workbook writing goes here.
    """
    pass
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()