Python学习笔记Day4
正则表达式 re
compile()
# compile(): build the pattern object once, then reuse it for searches.
pattern = re.compile("AA")      # the regex to look for
m = pattern.search("AABAA")     # first occurrence of "AA" inside "AABAA"
print(m)
search()
# search(): module-level helper, scans the string for the first match.
target = "aaA"
m = re.search("aa", target)
print(m)
findall()
# findall(): return every non-overlapping match as a list of strings.
all_lower = re.findall("a", "aaaAaa")         # each single lowercase "a"
print(all_lower)
caps_single = re.findall("[A-Z]", "aaaABaa")  # capitals, one at a time
print(caps_single)
caps_runs = re.findall("[A-Z]+", "aaaABaa")   # maximal runs of capitals
print(caps_runs)
sub() 更替
# sub(): substitution — replace every "a" with "A" and print the result.
replaced = re.sub("a", "A", "abcdsckd")
print(replaced)
# Tip: write regex pattern text as a raw string (prefix r) so backslashes
# are kept literally and escape sequences are no longer a concern.
a = r"\aabd-\'"
print(a)
Urllib
打开百度,打印返回的response
# Open baidu.com and print the decoded response body.
response = urllib.request.urlopen("http://www.baidu.com")
page_text = response.read().decode("utf-8")
print(page_text)
获取post请求
# POST request: urlencode the form fields, then encode them to bytes.
data = urllib.parse.urlencode({"hello": "word"}).encode("utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
获取get请求
# GET request with a 5-second timeout; failures are printed, not raised.
try:
    resp = urllib.request.urlopen("http://httpbin.org/get", timeout=5)
    print(resp.read().decode("utf-8"))
except Exception as err:
    print(err)
打印
# Inspect the response: status code, full header list, one header by name.
response = urllib.request.urlopen("http://baidu.com")
status_code = response.status
print(status_code)
all_headers = response.getheaders()
print(all_headers)
print(response.getheader("Server"))
访问豆瓣
# Visit douban.com: send form data via POST with a browser-like
# User-Agent header so the request is not rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}
form_bytes = urllib.parse.urlencode({"name": "hello"}).encode("utf-8")
req = urllib.request.Request(
    url="http://douban.com",
    data=form_bytes,
    headers=headers,
    method="POST",
)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
豆瓣前250部电影列表
from bs4 import BeautifulSoup #网页解析
import re #正则表达式
import urllib.request,urllib.error #制定url
import xlwt #进行excel
import sqlite3 #进行数据库操作
def main():
    """Entry point: scrape the Douban Top-250 movie list.

    Crawls the paginated list page by page, parses every movie entry,
    and (eventually) saves the result to an .xls workbook.
    """
    baseUrl = "https://movie.douban.com/top250?start="
    # NOTE(review): savePath is defined but never used — saveData() is
    # still an empty stub; wire it up once saving is implemented.
    savePath = ".\\豆瓣评分Top250.xls"
    # Crawl + parse all pages.
    getData(baseUrl)
# Fetch the page content of a single URL.
def askURL(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns an empty string on any failure (the error is printed), so
    callers always receive a str — previously a failed request fell off
    the end of the function and returned None, which crashed the
    BeautifulSoup parsing downstream.
    """
    # User-Agent header: tells the server what kind of client this is,
    # so the request looks like an ordinary browser visit.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
    }
    html = ""
    try:
        # Request construction is inside the try so malformed URLs
        # (ValueError: unknown url type) are handled the same way as
        # network errors.
        request = urllib.request.Request(url=url, headers=header)
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except Exception as result:
        print(result)
    return html
# Regex cheatsheet: "." matches any single character,
# "*" matches 0 or more repetitions,
# "?" matches 0 or 1 (and makes ".*?" non-greedy).
# Detail-page link of the movie.
findLink = re.compile(r'<a href="(.*?)">')
findImgSrc = re.compile(r'<img.*src="(.*?)"',re.S) # poster image URL; re.S lets "." also match newlines
# Movie title(s) — a Chinese title and possibly a foreign title.
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Rating score.
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of people who rated the movie.
findCommand = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary (quote).
findInq = re.compile(r'<span class="inq">(.*)</span>')
# NOTE(review): "fingBd" is a typo for "findBd" — kept unchanged because
# getData() references it by this name. Matches the credits/metadata paragraph.
fingBd = re.compile(r'<p class="">(.*?)</p>',re.S)
def getData(baseUrl):
    """Crawl all 10 pages of the Top-250 list and parse each movie entry.

    Returns a list of rows, one per movie:
    [link, img_src, cn_title, other_title, rating, rating_count, quote, credits].
    """
    dataList = []
    for page in range(0, 10):       # 10 pages x 25 movies = 250 entries
        url = baseUrl + str(page * 25)
        html = askURL(url)
        if not html:
            # Download failed — skip this page instead of crashing the parser.
            continue
        # Parse the page and walk every movie card.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                # Chinese title plus foreign title (leading "/" removed).
                data.append(titles[0])
                data.append(titles[1].replace("/", ""))
            else:
                data.append(titles[0])
                data.append(' ')    # placeholder keeps the row width fixed
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            command = re.findall(findCommand, item)[0]
            data.append(command)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                data.append(inq[0].replace("。", ""))  # drop trailing full stop
            else:
                data.append(" ")    # not every movie has a quote
            bd = re.findall(fingBd, item)[0]
            # Raw strings (r'...') avoid invalid-escape warnings for \s.
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # strip <br/> tags
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())  # trim surrounding whitespace
            dataList.append(data)
    print(dataList)
    return dataList
def saveData():
    """Persist the scraped data to an .xls workbook.

    TODO: not implemented yet — the xlwt workbook writing goes here.
    """
    pass
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()