【Python爬虫】爬取猫眼电影TOP100榜

最新推荐文章于 2024-03-28 07:33:11 发布

Renyan20

最新推荐文章于 2024-03-28 07:33:11 发布

阅读量1.4k

点赞数 1

文章标签： Python 爬虫 电影

本文链接：https://blog.csdn.net/qq_40124134/article/details/83033046

版权

import requests
import bs4
from bs4 import BeautifulSoup
# Base address of the board being scraped; individual pages are selected
# with an "offset" query parameter added at request time.
url = "http://maoyan.com/board/4"

# Text file that receives the scraped entries.
path = "I://Users//xieyingchao//Desktop//爬虫//movies.txt"

# Browser-like headers sent with every request (the original author notes
# that the site rejects ordinary crawlers).
header = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "maoyan.com",
    "Referer": "http://maoyan.com/board",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/69.0.349"
    ),
}
def GetText(url,header,offset,timeout=10):
    """Fetch one page of the board and return its decoded text.

    The board is paginated through the ``offset`` query parameter, so every
    page shares the same base URL and differs only in that value. Browser-like
    headers are sent because, per the original author's note, the site
    rejects ordinary crawlers.

    Args:
        url: Base URL of the board.
        header: Mapping of HTTP request headers to send.
        offset: Value for the ``offset`` query parameter (str or int).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        # Passing params as a dict lets requests build and encode the query
        # string, and accepts a non-string offset as well.
        r = requests.get(url, params={'offset': offset}, headers=header, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: network/HTTP failures yield an empty page, while
        # programming errors are no longer hidden by a bare except.
        return ""
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
    return r.text

def Text2html(text,ulist):
    """Parse page source and collect the text of every <dd> element.

    Per the original author's note, each movie entry on the page sits inside
    a <dd> element. The text of each one is cleaned with a fixed series of
    replacements and appended to ``ulist`` as a one-element list.

    Args:
        text: Page source to parse.
        ulist: List that is extended in place, one inner list per <dd>.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    for node in parsed.find_all('dd'):
        # Only tag objects are processed; anything else is skipped.
        if not isinstance(node, bs4.element.Tag):
            continue
        entry = node.getText()
        # Chained replacements (instead of a regex) trim the surplus newlines
        # and the run of indentation spaces left over from the markup.
        entry = entry.replace('\n\n\n\n\n\n\n\n', ' ')
        entry = entry.replace('                ', '')
        entry = entry.replace('\n\n\n\n', '\n')
        ulist.append([entry])

def Save2txt(ulist,path):
    """Write every collected entry to a UTF-8 text file.

    Args:
        ulist: Sequence of entries, each itself an iterable of strings; the
            strings are written back to back with no separator added.
        path: Destination file; it is created or overwritten.
    """
    # UTF-8 is set explicitly; the original author notes that the platform
    # default encoding raised UnicodeEncodeError. The context manager closes
    # the file even if a write fails part-way through.
    with open(path, 'w', encoding='UTF-8') as f:
        for entry in ulist:
            f.writelines(entry)

def main():
    """Scrape ten pages of the board, save the entries, and print them.

    Requests offsets 0, 10, ..., 90, parses each page into a shared list,
    writes that list to ``path`` and echoes it to standard output.
    """
    movies = []
    # One request per page; the offset advances in steps of ten.
    for start in range(0, 100, 10):
        page_source = GetText(url, header, str(start))
        Text2html(page_source, movies)
    Save2txt(movies, path)
    print(movies)
main()

爬取结果示例：