"""Scrape the 2345 hot-movie chart, print each entry and save its poster."""
import logging
import os

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Characters that cannot appear in a Windows file name (plus control
# whitespace); each one is replaced by "_" when building the poster file name.
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|\r\n\t'}
)


def getHtml(url):
    """Fetch *url* and return the response body decoded as GBK.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or ``''`` when the request fails (connection error,
        timeout after 30 seconds, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are treated as "no page"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        logger.warning('request for %s failed: %s', url, exc)
        return ''
    # The body is decoded as GBK rather than whatever requests guessed.
    r.encoding = 'gbk'
    return r.text


def _text_or(tag, default):
    """Return ``tag.get_text()``, or *default* when *tag* is ``None``."""
    return default if tag is None else tag.get_text()


def _download_poster(img_url, name, save_dir):
    """Download *img_url* and store it as ``<save_dir>/<name>.jpg``.

    Failures are logged and swallowed so that one broken poster does not
    abort the rest of the run.
    """
    # Only protocol-relative URLs ("//host/path") need a scheme added;
    # prefixing an already absolute URL would corrupt it.
    if img_url.startswith('//'):
        img_url = 'http:' + img_url

    file_name = name.strip().translate(_UNSAFE_FILENAME_CHARS) or 'untitled'
    path = os.path.join(save_dir, file_name + '.jpg')

    # Download first, so that a failed request does not leave an empty file.
    try:
        resp = requests.get(img_url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as exc:
        logger.warning('could not download poster %s: %s', img_url, exc)
        return

    try:
        os.makedirs(save_dir, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(resp.content)
    except OSError as exc:
        logger.warning('could not write poster %s: %s', path, exc)


def saveInfo(html, save_dir='D:/movie/'):
    """Print every movie found in *html* and save its poster image.

    The movies are the ``<li>`` items of the ``<ul class="picList clearfix">``
    element. For each one the title, release info, actor list and intro are
    printed, and the poster is saved as ``<title>.jpg`` in *save_dir*.

    Args:
        html: Page markup, as returned by :func:`getHtml`. An empty string or
            a page without the movie list results in no output.
        save_dir: Directory that receives the poster images; created on
            demand. Defaults to the original hard-coded location.
    """
    soup = BeautifulSoup(html, 'html.parser')
    movie_list = soup.find('ul', class_='picList clearfix')
    if movie_list is None:
        logger.warning('movie list not found in page; nothing to save')
        return

    for item in movie_list.find_all('li'):
        title_tag = item.find('span', class_='sTit')
        if title_tag is None:
            # Without a title there is nothing to print or to name the file.
            continue
        name = title_tag.get_text()

        release_time = _text_or(
            item.find('span', class_='sIntro'), '暂时无上映时间信息'
        )

        actors_tag = item.find('p', class_='pActor')
        if actors_tag is None:
            actors = ['暂时无演员姓名']
        else:
            # Children with no single string (``.string`` is None) are skipped.
            actors = [
                child.string for child in actors_tag.contents if child.string
            ]

        # Prefer the "pIntroHide" paragraph; fall back to "pIntroShow".
        intro_tag = item.find('p', class_='pTxt pIntroHide')
        if intro_tag is None:
            intro_tag = item.find('p', class_='pTxt pIntroShow')
        intro = _text_or(intro_tag, '')

        print('影片名:{}\t{}\n{}\n{}\n\n'.format(
            name, release_time, actors, intro))

        img_tag = item.find('img')
        img_url = img_tag.get('src') if img_tag is not None else None
        if img_url:
            _download_poster(img_url, name, save_dir)
        else:
            logger.warning('no poster image found for %r', name)


def main():
    """Download the hot-movie chart page and process it with saveInfo."""
    url = 'https://tv.2345.com/top/hot.html'
    html = getHtml(url)
    saveInfo(html)


# Guarded so that importing this module does not start a scrape.
if __name__ == '__main__':
    main()
# 2345电影爬取
# 于 2023-09-14 15:18:24 首次发布