Get the top 100 films by rating
import pymongo
import pandas as pd

client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client.ygdy.films
# 豆瓣评分 is stored as a string, so this $gt compares lexicographically
gt75 = collection.find({'豆瓣评分': {'$gt': '7.5'}}).sort('豆瓣评分', pymongo.DESCENDING)
result = pd.DataFrame(list(gt75))[['豆瓣评分', '译名', '年代', '上映日期', 'dwaddr']]
result = result.drop_duplicates().sort_values(by=['豆瓣评分', '上映日期'], ascending=False)
result.head(100).to_csv('top_film.csv')  # keep only the top 100, per the heading
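
Because 豆瓣评分 is stored as a string, both the $gt filter and the sort above compare lexicographically ('10.0' would sort below '9.0'). A minimal sketch that compares numerically instead, assuming the documents carry the same fields as above and that the stored rating parses as a plain number:

df = pd.DataFrame(list(collection.find()))
df['豆瓣评分'] = pd.to_numeric(df['豆瓣评分'], errors='coerce')  # unparsable -> NaN
df = df[df['豆瓣评分'] > 7.5]  # NaN compares False and drops out here
top100 = df.sort_values(by=['豆瓣评分', '上映日期'], ascending=False).head(100)
top100.to_csv('top_film_numeric.csv')  # hypothetical output name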
Crawl the data
import re
import itertools
import threadpool
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pymongo
from redis import StrictRedis, ConnectionPool
'''
Contents
1. req
2. crawl_page
3. crawl_links
4. crawl_single
5. thd
6. main
'''
'''
Storage layout
MongoDB: mongodb://localhost:27017/  ->  client.ygdy.films
Redis:   redis://@localhost:6379/1
  pages    - list-page URLs to crawl
  links    - film detail-page URLs
  exlinks  - detail pages already crawled
  newlinks - links minus exlinks (the work queue)
'''

client = pymongo.MongoClient('mongodb://localhost:27017/')
collection = client.ygdy.films
pool = ConnectionPool.from_url('redis://@localhost:6379/1')
redis = StrictRedis(connection_pool=pool)
ct = None  # shared progress counter, reset at the start of each crawl phase
requests.packages.urllib3.disable_warnings()
s = requests.session()
s.keep_alive = False
ua = UserAgent()
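
With both clients in place, a quick sanity check of the Redis sets listed above shows how far a previous run got (illustrative only, not part of the original flow):

for name in ('pages', 'links', 'exlinks', 'newlinks'):
    print(name, redis.scard(name))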

def req(url):
    '''
    Fetch a URL and return the parsed soup.
    '''
    response = s.get(url=url, headers={'User-Agent': ua.random}, verify=False, timeout=20)
    if response.status_code == 200:
        print('\n%s 【%s】 req is ok...' % (next(ct), url))
    else:
        print('\n%s 【%s】 req fail...' % (next(ct), url))
    response.encoding = 'GBK'  # the site serves GBK-encoded pages
    return BeautifulSoup(response.text, 'lxml')
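
req() raises on network errors, and every caller treats that as a lost page. A hedged sketch of a retry wrapper (req_with_retry is hypothetical, not part of the original flow):

import time

def req_with_retry(url, retries=3, backoff=2):
    # retry transient failures with a growing delay before giving up
    for attempt in range(retries):
        try:
            return req(url)
        except Exception as e:
            print('retry %d for %s: %s' % (attempt + 1, url, e.args))
            time.sleep(backoff * (attempt + 1))
    return None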

def crawl_page(start_url):
    '''
    Collect every list-page URL for a category and store them in redis set "pages".
    '''
    try:
        soup = req(start_url)
    except Exception as e:
        soup = None
        print('error...', e.args)
    if soup:
        # the last pager link encodes the total page count in its filename
        sor_href = soup.select('.x a')[-1].get('href')
        base_href = start_url + '_'.join(sor_href.split('_')[:2])
        pages = []
        for i in range(int(sor_href.split('_')[2].split('.')[0])):
            pages.append(base_href + '_' + str(i + 1) + '.html')
        redis.sadd('pages', *pages)
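
A worked example of the URL arithmetic above, assuming the last pager href looks like 'list_4_101.html' (the pattern the splits imply):

start_url = 'https://www.ygdy8.com/html/gndy/china/'
sor_href = 'list_4_101.html'  # hypothetical pager href
base_href = start_url + '_'.join(sor_href.split('_')[:2])  # .../china/list_4
total = int(sor_href.split('_')[2].split('.')[0])          # 101
pages = [base_href + '_' + str(i + 1) + '.html' for i in range(total)]
print(pages[0])   # https://www.ygdy8.com/html/gndy/china/list_4_1.html
print(pages[-1])  # https://www.ygdy8.com/html/gndy/china/list_4_101.html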

def crawl_links(page):
    '''
    Collect the film detail links on one list page and store them in redis set "links".
    '''
    try:
        soup = req(page)
    except Exception as e:
        soup = None
        print('error...', e.args)
    if soup:
        # the second <a> inside each .tbspan table is the film detail link
        nodes = soup.select('.tbspan a:nth-of-type(2)')
        links = ['https://www.ygdy8.com' + node.get('href') for node in nodes]
        redis.sadd('links', *links)

def crawl_single(link):
    '''
    Crawl one film detail page into MongoDB and record the visited URL
    in redis set "exlinks".
    '''
    link = link.decode('GBK')  # redis.smembers returns bytes
    try:
        soup = req(link)
    except Exception as e:
        soup = None
        print('error...', e.args)
    if soup:
        film = {}
        text = soup.select('#Zoom')[0].get_text()
        # the attribute block runs from 译名 (or 中文名 on older pages) up to
        # 【下载地址】; cut is the marker character that prefixes every field
        try:
            sp = re.split(r'译\s*名', text, 1)
            cut = sp[0][-1]
            attr = ('译 名' + sp[1]).split('【下载地址】')[0].strip()
        except IndexError:
            sp = re.split(r'中\s*文\s*名', text, 1)
            cut = sp[0][-1]
            attr = ('中文 名' + sp[1]).split('【下载地址】')[0].strip()
        attrs = attr.split(cut)
        for attr in attrs:
            if attr == ' ':
                continue
            attr = attr.replace('\u3000', ' ').strip()
            # collapse the padding inside the field name, e.g. '译  名' -> '译名'
            attr = re.sub(r'\s{2,}', '', attr, 1)
            if attr.startswith('主演'):
                attr = re.sub(r'\s{2,}', '&&', attr)  # join the actor list with &&
            if attr.startswith('简介') or attr.startswith('获奖情况'):
                attr = attr.replace('简介', '简介 ')
            if attr.startswith('IMDB评分') or attr.startswith('IMDb评分'):
                attr = attr.replace('IMDB评分', 'IMDB评分 ')
                attr = attr.replace('IMDb评分', 'IMDB评分 ')
            try:
                # the first remaining whitespace run separates name from value
                k, v = re.split(r'\s+', attr, 1)
            except ValueError:
                print('error on crawl_single, skipped:', attr)
                continue
            film[k] = v
        film['dwaddr'] = soup.select('#Zoom table tbody a')[0].get('href')
        film['link'] = link
        obid = collection.insert_one(film).inserted_id  # insert() was removed in pymongo 4
        print(obid)
        redis.sadd('exlinks', link)
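
For reference, one stored document roughly takes this shape (field names come from the parser above and the export step; all values are made up):

# {
#     '译名': '...',
#     '年代': '2019',
#     '上映日期': '2019-07-26',
#     '豆瓣评分': '8.2',
#     '主演': '演员A&&演员B',
#     'dwaddr': 'ftp://...',
#     'link': 'https://www.ygdy8.com/html/gndy/...',
# }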

def ac_crawl_single(link):
    # swallow per-link failures so one bad page does not kill a worker thread
    try:
        crawl_single(link)
    except Exception as e:
        print('error on ac_crawl_single', e.args)

def thd(ctd, smember, target):
    '''
    Run target over every member of redis set smember with ctd worker threads.
    '''
    global ct
    ct = itertools.count(1)
    pool = threadpool.ThreadPool(ctd)
    params = [((param,), None) for param in redis.smembers(smember)]
    tasks = threadpool.makeRequests(callable_=target, args_list=params)
    for task in tasks:
        pool.putRequest(task)
    pool.wait()
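
The threadpool package is old and unmaintained; if it is unavailable, an equivalent sketch on the standard library keeps the same redis-set/callable contract (thd_futures is hypothetical):

from concurrent.futures import ThreadPoolExecutor

def thd_futures(ctd, smember, target):
    global ct
    ct = itertools.count(1)
    with ThreadPoolExecutor(max_workers=ctd) as executor:
        # consume the iterator so worker exceptions are not silently dropped
        list(executor.map(target, redis.smembers(smember)))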

def main():
    start_urls = ['https://www.ygdy8.com/html/gndy/china/',
                  'https://www.ygdy8.com/html/gndy/oumei/',
                  'https://www.ygdy8.com/html/gndy/rihan/']
    global ct
    print('crawl_page===')
    ct = itertools.count(1)
    for start_url in start_urls:
        crawl_page(start_url)
    print('crawl_links===')
    thd(10, 'pages', crawl_links)
    print('crawl_single===')
    # only visit detail pages not seen in earlier runs: newlinks = links - exlinks
    redis.delete('newlinks')
    redis.sdiffstore('newlinks', ['links', 'exlinks'])
    thd(20, 'newlinks', ac_crawl_single)

if __name__ == '__main__':
    main()