python爬取猫眼TOP100信息

最新推荐文章于 2021-10-15 17:58:16 发布

乐亦亦乐

最新推荐文章于 2021-10-15 17:58:16 发布

阅读量528

点赞数 1

分类专栏：python爬虫　　文章标签：猫眼电影、top100、python爬虫

本文链接：https://blog.csdn.net/qq_41251963/article/details/86694786

版权

python爬虫专栏收录该内容

15 篇文章 5 订阅

订阅专栏

获取猫眼电影 TOP100 榜单信息，将结果写入文本文件（result.txt），也可以存入 MySQL 数据库（对应代码默认已注释）。

代码：

import requests
import re
import json
# import pymysql
from multiprocessing import Pool
from requests.exceptions import RequestException
# Browser-like request headers sent with every maoyan.com page request.
# Header names and values are written without embedded spaces: a name such
# as 'Cache - Control' is not the standard 'Cache-Control' header, and media
# types / language tags with spaces inside them are not well-formed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Host": "maoyan.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}
# class Sql(object):
#     conn = pymysql.connect(
#         host='127.0.0.1',
#         port=3306,
#         user='root',
#         passwd='mysql',
#         db='ccunews',
#         charset='utf8')

#     def addnews(self,movienum,moviename,moviegrade,movietime,moviemessage,moviecountry):
#         cur=self.conn.cursor()
#         cur.execute("insert into mymovie(movienum,moviename,moviegrade,movietime,moviemessage,moviecountry) values('%s','%s','%s','%s','%s','%s') "%(movienum,moviename,moviegrade,movietime,moviemessage,moviecountry))
#         lastrowid=cur.lastrowid
#         cur.close()#关闭游标
#         self.conn.commit()
#         return lastrowid

# mysql=Sql()

def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    try:
        # The module-level ``headers`` dict is sent with the request.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Covers connection errors and timeouts alike; callers only need to
        # know that no page is available.
        return None
    if response.status_code == 200:
        return response.text
    return None


# One match per <dd>...</dd> movie entry. Capture groups, in order:
# rank, poster URL, title, "star" paragraph, "releasetime" paragraph,
# integer part of the score, fractional part of the score.
# Raw strings keep "\d" a regex escape rather than an invalid string escape.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>'
    r'.*?data-src="(.*?)"'
    r'.*?name"><a.*?>(.*?)</a>'
    r'.*?star">(.*?)</p>'
    r'.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>'
    r'.*?fraction">(.*?)</i>'
    r'.*?</dd>',
    re.S,
)


def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example when the download failed) yields nothing instead of
            raising.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time``, ``score`` and ``country``; all values are strings.
    """
    if not html:
        return

    for rank, poster, title, star, release, integer, fraction in _MOVIE_PATTERN.findall(html):
        release = release.strip()
        yield {
            'index': rank,
            # Keep only the part of the URL before '@'
            # (presumably a resize suffix -- TODO confirm).
            'image': poster.split('@')[0],
            'title': title,
            # Skip the 3-character label that precedes the actor names.
            'actor': star.strip()[3:],
            # Skip the 5-character label, then take the 10-character date.
            'time': release[5:15],
            'score': integer + fraction,
            # Text between the bracket after the date and the final
            # character; empty when nothing follows the date.
            'country': release[16:-1],
        }



def write_to_file(content):
    """Append ``content`` to result.txt as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) instead of being
    escaped.
    """
    line = json.dumps(content, ensure_ascii=False) + '\n'
    # The with-statement closes the file on exit.
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line)



def main(offset):
    """Fetch one board page and append every movie on it to the result file.

    Args:
        offset: Value of the ``offset`` query parameter selecting the page.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on any failure; the parser needs a string.
        print('skipping offset', offset, '- page could not be downloaded')
        return

    for item in parse_one_page(html):
        # To store rows in MySQL as well, enable the commented-out Sql helper
        # and call:
        # mysql.addnews(item['index'], item['title'], item['score'], item['time'], item['actor'], item['country'])
        write_to_file(item)
        




if __name__ == '__main__':
    # One task per page: offsets 0, 10, ..., 90.
    # Using the pool as a context manager releases the worker processes once
    # map() has returned; the original never closed the pool.
    with Pool() as pool:
        pool.map(main, range(0, 100, 10))

运行结果：

获取猫眼首页图片：

import os
import re

import requests
from bs4 import BeautifulSoup

# Page whose images are downloaded.
url = "http://maoyan.com/"

# Browser-like request headers for the maoyan.com page request.
# Header names and values are written without embedded spaces: a name such
# as 'Cache - Control' is not the standard 'Cache-Control' header, and media
# types / language tags with spaces inside them are not well-formed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Host": "maoyan.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}
# Fetch the page; printing the response object shows the HTTP status.
# A timeout keeps a stalled connection from hanging the script.
html = requests.get(url=url, headers=headers, timeout=10)
print(html)
content = html.text

# Collect the data-src attribute of every <img data-src="..."> tag.
img = re.findall('<img data-src="(.*?)"', content)

# Headers for the image requests.
# - No "Host" entry: requests derives it from each URL, so images served
#   from a host other than p0.meituan.net are requested correctly.
# - "Accept-Language" was misspelled "Accept-Languag".
# - "br" is not advertised: requests can only decode Brotli responses when
#   an optional Brotli package is installed.
headers2 = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

# open() does not create missing directories, so make sure the target exists.
os.makedirs("maoyan", exist_ok=True)

for k, i in enumerate(img):
    # Keep only the part of the URL before '@'
    # (presumably a resize suffix -- TODO confirm).
    image_url = i.split('@')[0]
    print(image_url)
    try:
        # Download before opening the file so a failed request does not
        # leave an empty .jpg behind.
        htmlbb = requests.get(image_url, headers=headers2, timeout=10).content
    except requests.RequestException as exc:
        print("download failed:", exc)
        continue
    with open("maoyan/" + str(k) + ".jpg", 'wb') as f:
        f.write(htmlbb)

运行结果：