# 目标url:https://52online.vip/list/?1.html
import requests
from bs4 import BeautifulSoup
import time
import csv
# 请求访问
def request_url(ye, max_retries=None):
    """Fetch one listing page and return its HTML text.

    :param ye: 1-based page number; interpolated into the list URL.
    :param max_retries: optional cap on retry attempts; ``None`` keeps the
        original behavior of retrying forever until HTTP 200.
    :return: the response body (``str``) on success, or ``None`` if
        ``max_retries`` is exhausted.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
    url = 'https://52online.vip/list/?1-' + str(ye) + '.html'
    attempts = 0
    while max_retries is None or attempts < max_retries:
        attempts += 1
        # Polite delay before every attempt (matches original pacing).
        time.sleep(3)
        # NOTE(review): verify=False disables TLS certificate checking —
        # kept from the original, but it is a security risk.
        # timeout added so a stalled connection cannot hang the loop forever.
        r = requests.get(url, headers=headers, verify=False, timeout=30)
        if r.status_code == 200:
            print('访问成功:', url)
            return r.text
    return None
# 抓取网页数据
def url_texts(url_text):
    """Extract movie fields from one listing page's HTML.

    :param url_text: the HTML body of a listing page (``request_url`` result).
    :return: tuple of six parallel lists:
        names   - movie titles,
        dates   - release year (first 4 chars of the <p> text),
        guojia  - country (chars 5:7 of the <p> text),
        leixing - genre (last 3 chars of the <p> text),
        hds     - video quality labels,
        urls    - absolute movie page URLs.
    """
    base = 'https://52online.vip'
    urls = []     # movie URLs
    dates = []    # release years
    guojia = []   # countries
    leixing = []  # genres
    hds = []      # quality labels
    names = []    # movie titles
    soup = BeautifulSoup(url_text, 'lxml')
    for item in soup.find_all('li', class_='col-lg-8 col-md-6 col-sm-4 col-xs-3'):
        # Movie links: every <a> inside the card, made absolute.
        for a in item.find_all('a'):
            urls.append(base + a.get('href').strip())
        # Date / country / genre are fixed slices of the same <p> text —
        # compute the stripped text once instead of three times.
        for p in item.select('p'):
            text = p.get_text().strip()
            dates.append(text[:4])       # release year
            guojia.append(text[5:7])     # country
            leixing.append(text[-3:])    # genre
        # Quality badge, e.g. "HD".
        for span in item.select('span.pic-text'):
            hds.append(span.get_text().strip())
        # Title lives in the thumbnail link's title attribute.
        for a in item.find_all('a', class_='myui-vodlist__thumb lazyload'):
            names.append(a.get('title').strip())
    return names, dates, guojia, leixing, hds, urls
# 储存成csv
def csv_text(number):
    """Scrape ``number`` pages and append every movie as one CSV row.

    :param number: page count (str or int); pages 1..number are fetched.
    Side effect: appends rows to the CSV file at ``text_csv``.
    """
    # Raw string: the original 'D:\IT\...' only worked because \I happens
    # not to be a recognized escape sequence.
    text_csv = r'D:\IT\电影.csv'
    for page in range(1, int(number) + 1):
        names, dates, countries, genres, quality, links = url_texts(request_url(page))
        # Open the file once per page instead of once per row.
        with open(text_csv, 'a+', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(zip(names, dates, countries, genres, quality, links))
def main():
    """Prompt for a page count and scrape that many pages into the CSV."""
    number = input('请输出爬取多少页:')
    csv_text(number)  # fetch, parse, and append rows to the CSV file


if __name__ == '__main__':
    main()
# 运行: (run the script; stray top-level text commented out — it was a SyntaxError)