# https://www.domp4.com/list/6-1.html
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import os


def get_url_content(url):
    """Fetch the page source of *url*.

    Returns the response body text on HTTP 200, otherwise False.
    (Original had a C++-style ``//`` comment on the def line — a SyntaxError.)
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return False


def parse_Web_Content(content):
    """Parse one listing page's HTML into a list of film records.

    Each record is a dict with keys 'fileName', 'filmCast', 'filmIntro'
    and 'filmurl' (made absolute against https://www.domp4.com).
    zip() pairs the four parallel lists safely even if one selector
    matches fewer nodes (the original indexed by len(filmName) and
    would raise IndexError in that case).
    """
    Object = BeautifulSoup(content, 'html.parser')

    filmName = get_film_name(Object)
    filmCast = get_film_cast(Object)
    filmIntro = get_film_introduction(Object)
    filmUrl = get_film_url(Object)

    film = []
    for name, cast, intro, href in zip(filmName, filmCast, filmIntro, filmUrl):
        film.append({
            'fileName': name,
            'filmCast': cast,
            'filmIntro': intro,
            'filmurl': 'https://www.domp4.com' + href,
        })
    return film


def get_film_name(Soup):
    """Return the film titles: text of the first <a> in each .play_info node."""
    return [node.a.string for node in Soup.select(".play_info")]


def get_film_cast(Soup):
    """Return the cast text of every <p class="space"> element."""
    return [p.text for p in Soup.find_all('p', attrs={'class': 'space'})]


def get_film_introduction(Soup):
    """Return the synopsis text of every <p class="content"> element."""
    return [p.text for p in Soup.find_all('p', attrs={'class': 'content'})]


def get_film_url(Soup):
    """Return the relative detail-page href of each .play_info node's first <a>."""
    return [node.a['href'] for node in Soup.select(".play_info")]


def writeTofile(parsedWebcontent):
    """Append each film record to film.txt as one tab-separated line.

    The ``with`` block closes the file; the original's explicit
    f.close() inside the context manager was redundant.
    """
    with open('film.txt', 'a', encoding='utf-8') as f:
        for record in parsedWebcontent:
            f.write(record['fileName'] + '\t')
            f.write(record['filmCast'] + '\t')
            f.write(record['filmIntro'] + '\t')
            f.write(record['filmurl'] + '\t')
            f.write('\n')
# Crawl listing pages 1-3 and append the parsed film records to film.txt.
link = "https://www.domp4.com/list/6-"
for i in range(1, 4):
    url = link + str(i) + ".html"
    webContent = get_url_content(url)

    # get_url_content returns the sentinel False on a non-200 response;
    # test identity (is not), not equality (!=), against a singleton.
    if webContent is not False:
        Content = parse_Web_Content(webContent)
        writeTofile(Content)