Scraping the Maoyan TOP100 with Python

Fetch the Maoyan TOP100 movie information, write it to a text file, and store it in a database.

Code:

import requests
import re
import json
# import pymysql
from multiprocessing import Pool
from requests.exceptions import RequestException
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Host': 'maoyan.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}
# class Sql(object):
#     conn = pymysql.connect(
#         host='127.0.0.1',
#         port=3306,
#         user='root',
#         passwd='mysql',
#         db='ccunews',
#         charset='utf8')

#     def addnews(self,movienum,moviename,moviegrade,movietime,moviemessage,moviecountry):
#         cur=self.conn.cursor()
#         cur.execute("insert into mymovie(movienum,moviename,moviegrade,movietime,moviemessage,moviecountry) values('%s','%s','%s','%s','%s','%s') "%(movienum,moviename,moviegrade,movietime,moviemessage,moviecountry))
#         lastrowid=cur.lastrowid
#         cur.close()  # close the cursor
#         self.conn.commit()
#         return lastrowid

# mysql=Sql()

def get_one_page(url):
    # Request one page of the TOP100 board; return the HTML text, or None on failure.
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # Each movie sits in one <dd> block; capture rank, poster URL, title,
    # cast line, release-time line, and the two halves of the score.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1].split('@')[0],    # drop the @... thumbnail suffix
            'title': item[2],
            'actor': item[3].strip()[3:],      # strip the leading "主演:" label
            'time': item[4].strip()[5:15],     # date part of "上映时间:YYYY-MM-DD(country)"
            'score': item[5] + item[6],        # integer part + fractional part
            'country': item[4].strip()[16:-1]  # country inside the trailing parentheses
        }



def write_to_file(content):
    # Append one movie per line as JSON; the with-block closes the file automatically.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')



def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        return
    for item in parse_one_page(html):
        # mysql.addnews(item['index'], item['title'], item['score'], item['time'], item['actor'], item['country'])
        write_to_file(item)


if __name__ == '__main__':
    # The board is paged 10 movies at a time; crawl the 10 pages in parallel.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
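
The database branch is left commented out above. As a minimal sketch of how it could be enabled (assuming a local MySQL instance, the same placeholder credentials as in the commented block, and a mymovie table with those six columns), a parameterized insert with pymysql avoids the quoting problems of the string-formatted SQL:

import pymysql

# Placeholder connection settings -- adjust host, user, passwd and db to your environment.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='mysql', db='ccunews', charset='utf8')

def add_movie(item):
    # Parameterized query: pymysql escapes the values, so quotes in a title cannot break the SQL.
    sql = ("insert into mymovie"
           "(movienum, moviename, moviegrade, movietime, moviemessage, moviecountry) "
           "values (%s, %s, %s, %s, %s, %s)")
    with conn.cursor() as cur:
        cur.execute(sql, (item['index'], item['title'], item['score'],
                          item['time'], item['actor'], item['country']))
    conn.commit()

Calling add_movie(item) alongside write_to_file(item) inside main would then persist each record as well.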

Output:

(screenshot of result.txt omitted)
Fetching the images from the Maoyan homepage:

import os
import re

import requests
from bs4 import BeautifulSoup

url = "http://maoyan.com/"

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Host': 'maoyan.com',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

response = requests.get(url=url, headers=headers)
print(response.status_code)
content = response.text

# The image URLs are carried in the data-src attribute of the <img> tags.
img = re.findall('<img data-src="(.*?)"', content)

# Separate headers for the image CDN host.
headers2 = {
    'Host': 'p0.meituan.net',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:63.0) Gecko/20100101 Firefox/63.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
# url2="https://p0.meituan.net/movie/3e7696319c840d4890e947b926259d532809641.jpg"
# c=requests.get(url2,headers=headers2)
# # print(c)
# with open("1.jpg",'wb')as f:
#     f.write(c.content)
# Make sure the output directory exists before saving the images.
os.makedirs("maoyan", exist_ok=True)

k = 0
for i in img:
    url = i.split('@')[0]  # drop the @... thumbnail suffix to get the full-size image
    print(url)
    with open("maoyan/" + str(k) + ".jpg", 'wb') as f:
        image_data = requests.get(url, headers=headers2).content
        f.write(image_data)
    k += 1
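
BeautifulSoup is imported above but never used; the extraction is done with a regular expression instead. As a rough equivalent with bs4 (a sketch, assuming the homepage keeps the image URLs in the data-src attribute), the list of image URLs could also be built like this:

from bs4 import BeautifulSoup  # already imported in the script above

soup = BeautifulSoup(content, 'html.parser')
# Collect the data-src value of every <img> tag that carries one.
img = [tag['data-src'] for tag in soup.find_all('img', attrs={'data-src': True})]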

Output:

(screenshots of the downloaded images omitted)