import urllib.request
from bs4 import BeautifulSoup
import re
import xlwt
# --- Regular expressions for scraping fields from each movie's HTML block ---
findLink = re.compile(r'<a class="" href="(.*?)">')  # detail-page URL
title = re.compile(r'<span class="title">(.*?)</span>')  # movie title(s); pages carry 1 or 2 of these
imgLink = re.compile(r'<img .* src="(.*?)".*/>')  # poster image URL
daoyan = re.compile(r'<p class="">.*导演:(.*?)主.',re.S)  # director ("导演" = director)
zhuyan = re.compile(r'<p class="">.*主.:(.*?)/.*</p>',re.S)  # lead actors ("主演" = starring)
message = re.compile(r'<p class="">.*主..*<br/>(.*?)</p>',re.S)  # the year/country/genre line after <br/>
pingfen = re.compile(r'<span class="rating_num".*property="v:average">(.*?)</span>',re.S)  # average rating
pingjiaNum = re.compile(r'<span>(.*?)人评价</span>')  # rating count ("人评价" = people rated)
quote = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary quote
# Shared accumulator: bs4() appends one 10-field record per movie; saves() writes it to Excel.
data = []
def getData(baseurl):
    """Fetch *baseurl* over HTTP and return the body decoded as UTF-8.

    A browser User-Agent header is sent because douban.com rejects the
    default urllib agent.

    Fix: the original never closed the response object (connection leak);
    ``urlopen`` is now used as a context manager.
    """
    headers = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64;rv:82.0) Gecko / 20100101 Firefox / 82.0"
    }
    request = urllib.request.Request(url=baseurl, headers=headers)
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def readPage(pageNum):
    """Return the cached HTML for list page *pageNum* as a string.

    Reads ``doubantop250/<pageNum>pages.html`` (written by ``savePage``).

    Fix: the original opened the file and never closed it (handle leak);
    a ``with`` block now guarantees closure.
    """
    with open("doubantop250/%dpages.html" % pageNum, "r", encoding="utf-8") as f:
        return f.read()
def bs4(htmlFile):
    """Parse one Douban Top250 list page and append one record per movie
    to the module-level ``data`` list.

    Each record is a 10-element list:
    [detail link, image link, Chinese title, English title, director,
     lead actors, release year, rating, rating count, summary quote].

    NOTE(review): several ``re.findall(...)[0]`` calls below raise
    IndexError if Douban changes its markup — no match means no element 0.
    """
    # Each movie entry lives in <div class="item">.
    bs = BeautifulSoup(htmlFile,"html.parser")
    item = bs.find_all("div", class_="item") #class_ (trailing underscore) avoids clashing with the Python keyword
    for dy in item:
        movie = []
        # Run the module-level regexes over the raw HTML text of this tag.
        dy = str(dy)
        link = re.findall(findLink,dy)[0]
        movie.append(link)
        imglink = re.findall(imgLink,dy)[0]
        movie.append(imglink)
        tit = re.findall(title,dy)
        if tit.__len__()==2:
            # Both a Chinese and a foreign title are present.
            ctitle = tit[0]
            # Strip the leading "/" separator from the foreign title...
            wtitle = str(tit[1]).replace("/","")
            # ...then collapse its whitespace runs to single spaces.
            elist = wtitle.split()
            wtitles = ""
            for el in elist:
                wtitles = wtitles+el+" "
            movie.append(ctitle)
            movie.append(wtitles)
        else:
            # Chinese title only; keep the record width fixed with a blank.
            ctitle = tit[0]
            movie.append(ctitle)
            movie.append(" ")
        # Director ("导演").
        daoy = re.findall(daoyan,dy)
        # NOTE(review): str() on the findall list leaves "['...']" bracket/quote
        # artifacts in the saved cell, and r"\xa0" removes the literal four
        # characters backslash-x-a-0 (the repr of NBSP), not a real NBSP.
        daoylist = str(daoy).replace(r"\xa0","")
        movie.append(daoylist)
        # Lead actors ("主演"); same str(list) artifact as above.
        zhuy = re.findall(zhuyan,dy)
        zhuy = str(zhuy).replace("<br","")
        movie.append(zhuy)
        # Release year: second whitespace-separated token of the info line,
        # truncated to 4 chars. Fragile — depends on Douban's exact layout.
        startYear = re.findall(message,dy)
        if startYear.__len__()==0:
            movie.append("")
        else:
            movie.append(str(startYear).split().pop(1)[0:4])
        # Average rating.
        rating_num = re.findall(pingfen,dy)[0]
        movie.append(rating_num)
        # Number of ratings.
        pjnum = re.findall(pingjiaNum,dy)[0]
        movie.append(pjnum)
        # One-line summary quote; not every movie has one.
        inq = re.findall(quote,dy)
        if inq.__len__()==0:
            movie.append(" ")
        else:
            movie.append(inq[0])
        data.append(movie)
def name_is_exists(tag):
    """BeautifulSoup filter: True when *tag* carries an ``href`` attribute."""
    has_href = tag.has_attr("href")
    return has_href
def saves(data, path):
    """Write the scraped movie records to an .xls workbook at *path*.

    Parameters:
        data: list of 10-element rows as built by ``bs4``.
        path: destination file path for the Excel workbook.

    Fixes: the original hard-coded exactly 250 rows, so a partial scrape
    crashed with IndexError; it now writes however many rows exist. The
    per-row debug prints are removed.
    """
    book = xlwt.Workbook(encoding="UTF-8")
    sheet = book.add_sheet("豆瓣TOP250")
    col = ["详情链接", "图片链接", "电影名称", "英文名称", "导演",
           "主演", "上映时间", "豆瓣评分", "评论人数", "概括"]
    # Header row.
    for i, heading in enumerate(col):
        sheet.write(0, i, heading)
    # Data rows start at row 1, directly below the header.
    for row, record in enumerate(data, start=1):
        for c, value in enumerate(record[:len(col)]):
            sheet.write(row, c, value)
    book.save(path)
def savePage(htmlfile, pageNum):
    """Cache one page of HTML to ``doubantop250/<pageNum>pages.html``.

    Fixes: the original opened the file twice (an "a+" open/close with no
    effect, then a "wb" reopen) and wrapped the encode in a redundant
    ``bytes(...)`` call. A single binary write is sufficient.
    """
    with open("doubantop250/%dpages.html" % pageNum, "wb") as f:
        # Binary mode + explicit UTF-8 encode avoids platform newline translation.
        f.write(htmlfile.encode("utf-8"))
if __name__ == "__main__":
    baseurl = "https://movie.douban.com/top250?start="
    # Walk the 10 list pages (25 movies per page, offsets 0, 25, ..., 225).
    for page in range(10):
        url = baseurl + str(page * 25)
        # htmlFile = getData(url)   # live fetch — disabled in favour of the local cache
        htmlFile = readPage(page)
        # savePage(htmlFile, page)  # re-enable together with getData to refresh the cache
        print("*", page)
        # Parse this page; records accumulate in the module-level `data`.
        bs4(htmlFile)
    print(data)
    # Persist every scraped record to an Excel workbook.
    saves(data, "D:/自编程序/studyPython/doubanTOP250.xls")