目录
一、任务描述
爬虫任务:以函数形式实现多线程,并发爬取网页内容
二、任务网站描述
三、运行结果及说明
1.声明此次需要导入的包以及此次要爬取信息的网站
2.用函数编写第一个线程,爬取猫眼电影排行榜的电影名称
3.用函数编写第二个线程,爬取猫眼电影排行榜的主演
4.多线程运行
5.最终结果
四、源码
import requests
import re
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import requests
# URL of the page both scraper functions fetch.
# NOTE(review): the original right-hand side was a bare, unquoted name (the
# title text of the site's verification page), which raises NameError at
# import time. The Maoyan TOP100 board URL is assumed here -- TODO confirm
# the intended target page.
html = 'https://maoyan.com/board/4'
def get_name(thread_name, html):
    """Fetch the page at ``html`` and return the movie titles found in it.

    Every ``<dd>`` element of the page is searched for a ``<p class="name">``
    tag; the tag's string is printed (prefixed with "电影:") and collected.

    Args:
        thread_name: Label supplied by the thread launcher. It is not used
            inside the function.
        html: URL of the page to download.

    Returns:
        A list of the title strings found, in document order. The list is
        empty when the request fails or no matching tags exist. ``<dd>``
        entries that lack the tag, or whose tag has no single string, are
        skipped instead of raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    }
    # Bound before the request so the final ``return`` is valid on every path.
    name = []
    try:
        # A timeout keeps the worker thread from blocking forever on a
        # stalled connection.
        resp = requests.get(html, headers=headers, timeout=10)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTP-error branch below could never run.
        resp.raise_for_status()
    except (HTTPError, requests.exceptions.HTTPError) as e:
        print(e)
    except (URLError, requests.exceptions.RequestException):
        # The request is made with ``requests``, so its exception hierarchy
        # must be caught; the urllib classes are kept for compatibility.
        print('The server could not be found')
    else:
        soup = BeautifulSoup(resp.content, 'html.parser')
        for dd in soup.find_all('dd'):
            # Movie title
            tag = dd.find('p', {'class': 'name'})
            title = tag.string if tag is not None else None
            if title is None:
                continue
            print("电影:" + title)
            name.append(title)
        print('It Worked!')
    return name
def get_stars(thread_name, html):
    """Fetch the page at ``html`` and return the cast strings found in it.

    Every ``<dd>`` element of the page is searched for a ``<p class="star">``
    tag; the tag's string is printed and collected.

    Args:
        thread_name: Label supplied by the thread launcher. It is not used
            inside the function.
        html: URL of the page to download.

    Returns:
        A list of the cast strings found, in document order. The list is
        empty when the request fails or no matching tags exist. ``<dd>``
        entries that lack the tag, or whose tag has no single string, are
        skipped instead of raising or yielding ``None``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    }
    # Bound before the request so the final ``return`` is valid on every path.
    stars = []
    try:
        # A timeout keeps the worker thread from blocking forever on a
        # stalled connection.
        resp = requests.get(html, headers=headers, timeout=10)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTP-error branch below could never run.
        resp.raise_for_status()
    except (HTTPError, requests.exceptions.HTTPError) as e:
        print(e)
    except (URLError, requests.exceptions.RequestException):
        # The request is made with ``requests``, so its exception hierarchy
        # must be caught; the urllib classes are kept for compatibility.
        print('The server could not be found')
    else:
        soup = BeautifulSoup(resp.content, 'html.parser')
        for dd in soup.find_all('dd'):
            # Leading actors
            tag = dd.find('p', {'class': 'star'})
            star = tag.string if tag is not None else None
            if star is None:
                continue
            print(star)
            stars.append(star)
        print('It Worked!')
    return stars
import _thread  # kept: other parts of the file may still reference it
import threading

# Run the two scrapers concurrently, one thread each. threading.Thread is
# used instead of _thread.start_new_thread so the threads can be joined:
# with the low-level API the main thread reaches the end of the script
# immediately and the interpreter exits, typically killing both workers
# before they have fetched anything.
workers = [
    threading.Thread(target=get_name, args=('Thread 1', html)),
    threading.Thread(target=get_stars, args=('Thread 2', html)),
]
for worker in workers:
    worker.start()
# Block until both scrapers have finished.
for worker in workers:
    worker.join()