目标网址:猫眼电影 猫眼验证中心
目标信息:猫眼电影网址相关信息如电影名称、评分、类型等等
保存方式:文本文档
XPath version:
from lxml import etree
from selenium import webdriver
from time import sleep
# Shared mutable state for the XPath-based scraper (filled in by the functions below).
urls=[]      # [relative_href, category_label] pairs collected by get_url()
url_nums=[]  # relative detail-page hrefs collected by get_furl()
names = []   # film titles, one per detail page
cls = []     # genre <a> node lists (text extracted later when writing output)
srs = []     # score strings ("int part" + "fraction part" joined)
countrys = []  # country/region strings
times = []     # release-date strings
# One shared browser instance reused by every request in this script.
driver = webdriver.Chrome()
def get_url():
    """Scrape the category-tag links from the films page into the global `urls`.

    Each entry appended is a two-item list: [relative_href, category_label].
    The slice [11:78] selects the genre/region/era tag anchors on the page.
    """
    driver.get('https://maoyan.com/films?showType=3&offset=0')
    sleep(1)
    page = driver.page_source.encode("utf8")
    tree = etree.HTML(page, parser=etree.HTMLParser(encoding="utf8"))
    hrefs = tree.xpath('//ul/li/a/@href')[11:78:]
    anchors = tree.xpath('//ul/li/a')[11:78:]
    for idx, href in enumerate(hrefs):
        urls.append([href, anchors[idx].text])
def d_url():
    """Show the scraped category labels, ask the user to pick one, and
    return the full listing URL for that category (base URL if no match)."""
    result = 'https://maoyan.com/films'
    print('有以下电影类型可供爬取:')
    for entry in urls:
        print(entry[1])
    cl = input('输入你想爬取的电影类型:')
    for entry in urls:
        if entry[1] == cl:
            result = result + entry[0]
            break
    return result
def get_html(url):
    """Load *url* in the shared browser and return the page source as UTF-8 bytes."""
    driver.get(url)
    sleep(0.5)
    return driver.page_source.encode("utf8")
def get_furl(txt):
    """Parse one listing page: collect film detail hrefs into the global
    `url_nums` and assemble score strings into the global `srs`.

    txt: UTF-8-encoded page source (bytes) as returned by get_html().
    """
    html = etree.HTML(txt, parser=etree.HTMLParser(encoding="utf8"))
    films = html.xpath('//dd/div/div/a/@href')
    scrs = html.xpath('//dd/div/i')
    url_nums.extend(films)
    # Scores arrive as <i> pairs (integer part, fractional part); join each pair.
    # Bound at len-1 so an odd trailing node cannot raise IndexError
    # (the original `scrs[i+1]` crashed on an unpaired last element).
    for i in range(0, len(scrs) - 1, 2):
        srs.append(scrs[i].text + scrs[i + 1].text)
# Scrape one film's detail page and append its fields to the module-level lists.
# NOTE(review): name looks like a typo for "get_message"; kept as-is for callers.
# The absolute /html/body/... XPaths are brittle — they break if the page layout
# changes. TODO confirm they still match the current detail-page markup.
def get_massage(url):
    driver.get(url)
    sleep(5)  # long wait: detail pages are slow / may trigger the verification page
    html = driver.page_source
    txt = html.encode("utf8")
    html = etree.HTML(txt,parser=etree.HTMLParser(encoding="utf8"))
    name = html.xpath('//div/div/h1')[0].text # film title
    names.append(name)
    s = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a') # genre <a> nodes — the NODE LIST is stored; .text is read later when writing output
    cls.append(s)
    st = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[2]')[0].text # country/region of production
    countrys.append(st.replace('\n',''))
    t = html.xpath('/html/body/div[3]/div/div[2]/div[1]/ul/li[3]')[0].text # release date
    times.append(t)
if __name__ == '__main__':
    # Crawl flow: category list -> user choice -> listing page(s) -> detail pages.
    get_url()
    url = d_url()
    # Only the first listing page (offset=0) is fetched; raise the range bound
    # to crawl more pages (each page holds 30 films).
    for i in range(1):
        txt = get_html((url + "&offset=" + str(i * 30)).replace('amp;', ''))
        get_furl(txt)
    # Progress counter via enumerate instead of a hand-rolled flag variable.
    for flag, href in enumerate(url_nums, start=1):
        print("%.2f%%" % ((flag / len(url_nums)) * 100))
        furl = 'https://maoyan.com' + href
        get_massage(furl)
    # Context manager guarantees the file is flushed and closed even on error
    # (the original opened the handle and never closed it).
    with open('demopath.txt', 'w', encoding='utf-8') as file:
        for i in range(len(names)):
            # cls[i] is a list of genre <a> nodes; join their texts,
            # keeping the original trailing-space-per-genre format.
            cl1 = ''.join(node.text + ' ' for node in cls[i])
            file.write(names[i] + " " * 4 + cl1 + " " * 4 + srs[i] + countrys[i] + times[i] + "\n")
BeautifulSoup4 version:
import re
from selenium import webdriver
from time import sleep
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Third-party user-agent database loaded from a local JSON dump (fake_useragent).
ua = UserAgent(path='D:/fakeuseragent.json/fake_useragent_0.1.11.json')
# Shared mutable state for the BeautifulSoup-based scraper.
urls=[]      # (category_label, href, label) tuples from get_url()
url_nums=[]  # movie id strings from get_fln()
names = []   # film titles
cls = []     # genre strings (text, unlike the XPath version which stores nodes)
srs = []     # score strings
countrys = []  # country/region strings
times = []     # release-date strings
headers = {'User-Agent':ua.random}  # NOTE(review): never used — Selenium drives the browser directly
driver = webdriver.Chrome()
def get_url():
    """Collect (tag_name, href, label) tuples for every category tag into `urls`.

    Skips the first tag-click anchor (the "全部" / all entry).
    """
    driver.get('https://maoyan.com/films?showType=3&offset=0')
    sleep(1)
    page = driver.page_source.encode("utf8")
    soup = BeautifulSoup(page, 'lxml')
    pattern = '''<a data-act="tag-click" data-val="{TagName:'(.*)'}" href="(.*)">(.*)</a>'''
    anchors = soup.select('a[data-act="tag-click"]')
    for anchor in anchors[1::]:
        urls.append(re.match(pattern, str(anchor)).groups())
def get_durl():
    """Show available category names, ask the user for one, and return the
    listing URL for that category (base films URL if nothing matches)."""
    result = 'https://maoyan.com/films'
    print('有以下电影类型可供爬取:')
    for entry in urls:
        print(entry[0])
    cl = input('输入你想爬取的电影类型:')
    for entry in urls:
        if entry[0] == cl:
            result = result + entry[1]
            break
    return result
def get_html(url):
    """Load *url* in the shared browser and return the page source as UTF-8 bytes."""
    driver.get(url)
    sleep(0.5)
    return driver.page_source.encode("utf8")
def get_fln(txt):
    """Extract movie ids into `url_nums` and score strings into `srs`
    from one listing page (txt: UTF-8 page-source bytes)."""
    soup = BeautifulSoup(txt, 'lxml')
    movie_links = soup.select('a[data-act="movies-click"]')
    score_divs = soup.select('div[class="channel-detail channel-detail-orange"]')
    id_pattern = '<a data-act="movies-click" data-val="{movieId:(.*)}" href='
    # First anchor is skipped (matches the original's t[1::] slice).
    for link in movie_links[1::]:
        url_nums.append(re.match(id_pattern, str(link)).group(1))
    srs.extend(div.text for div in score_divs)
# Scrape one film's detail page.
def get_massage(url):
    """Append one film's name, genre, country and release date to the
    module-level result lists. NOTE: name looks like a typo for "get_message"."""
    driver.get(url)
    sleep(5)  # long wait: detail pages are slow / may trigger verification
    soup = BeautifulSoup(driver.page_source.encode("utf8"), 'lxml')
    title_nodes = soup.select('h1[class="name"]')
    names.append(title_nodes[0].text)                 # film title
    info = soup.select('li[class="ellipsis"]')
    cls.append(info[0].text.replace('\n', ''))        # genre
    countrys.append(info[1].text.replace('\n', ''))   # country of production
    times.append(info[2].text.replace('\n', ''))      # release date
if __name__ == '__main__':
    # Crawl flow: category list -> user choice -> 10 listing pages -> detail pages.
    get_url()
    url = get_durl()
    # 10 listing pages, 30 films per page.
    for page in range(10):
        txt = get_html((url + "&offset=" + str(page * 30)).replace('amp;', ''))
        get_fln(txt)
    # Progress counter via enumerate instead of a hand-rolled flag variable.
    for flag, movie_id in enumerate(url_nums, start=1):
        print("%.2f%%" % ((flag / len(url_nums)) * 100))
        furl = 'https://maoyan.com/films/' + movie_id
        get_massage(furl)
    # Context manager guarantees the file is flushed and closed even on error
    # (the original opened the handle and never closed it).
    with open('demo2.txt', 'w', encoding='utf-8') as file:
        for i in range(len(names)):
            file.write(names[i] + " " * 4 + cls[i] + " " * 4 + srs[i] + countrys[i] + times[i] + "\n")