# NOTE: Recently interviewed for a web-scraping internship; writing this scraper as practice.
import re
import urllib.request
import urllib.error
def get_douban_pages():
    """Scrape the Douban Top 250 movie list and print the mainland titles.

    Fetches all 10 result pages (25 movies each), extracts every
    ``<span class="title">`` entry, keeps only the primary title (the
    alternate-language titles carry a separator character that the filter
    pattern excludes), and prints each with its overall 1-based rank.

    Returns:
        None. Output goes to stdout; network errors print the reason and
        skip to the next page.
    """
    index = 1  # overall movie rank, 1-based, across all pages
    # Pretend to be a regular browser so Douban does not reject the request.
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0'
    }
    # Compile both patterns once, outside the page loop.
    # Capture group pulls the title text directly, replacing the original
    # brittle slicing (name_html[20:][0:-7]) that hard-coded tag lengths.
    title_re = re.compile(r'<span class="title">(.*?)</span>')
    # Keep only titles NOT containing the separator character: foreign-title
    # variants are prefixed with a slash separator, the mainland title is not.
    # NOTE(review): the excluded character here may originally have been a
    # non-breaking space (&nbsp;) — verify against the fetched HTML.
    mainland_re = re.compile(r'^(?!.* )')
    for page in range(0, 10):  # pages 0..9 -> start=0, 25, ..., 225
        url = 'https://movie.douban.com/top250?start=%d&filter=' % (page * 25)
        req = urllib.request.Request(url, headers=header)
        try:
            response = urllib.request.urlopen(req, timeout=1.5)
            html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            # Best-effort: report the failure and move on to the next page.
            if hasattr(e, 'reason'):
                print(e.reason)
            continue
        for title in title_re.findall(html):
            if mainland_re.match(title):
                print('NO.', index, ' ', title)
                index += 1
if __name__ == "__main__":
get_douban_pages()