import requests
import re
import sys
from urllib.parse import urljoin

# Home page of the site being scraped; links found on it are resolved against
# this address.
url = 'https://www.dy2018.com'
# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
}
# requests waits forever unless a timeout is given, so a stalled server would
# hang the script. Value is in seconds.
REQUEST_TIMEOUT = 10

# Captures the contents of the first <ul> that follows the text "2020必看热片".
obj1 = re.compile(r"2020必看热片.*?<ul>(?P<movies>.*?)</ul>", re.S)
# Captures the link target and the link text of each <a href='...' title=...>.
obj2 = re.compile(r"<a href='(?P<href>.*?)' title=.*?>(?P<title>.*?)</a>", re.S)
# Captures the href of the first <li><a> after the <ul> inside a
# <div class=player_list> block of a detail page.
obj3 = re.compile(r'<div class=player_list>.*?<ul>.*?<li><a href="(?P<child_link_href>.*?)">', re.S)

# Absolute detail-page URLs, in the order they appear on the home page.
child_href_list = []
# The raw href values exactly as they appear in the home page HTML.
url_list = []

# A single session lets every request reuse the same connection to the host.
with requests.Session() as session:
    resp = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error rather than quietly matching nothing in an
    # error page.
    resp.raise_for_status()
    resp.encoding = 'gb2312'
    page_content = resp.text

    # Step 1: collect title + link for every movie in the matched list(s).
    for section in obj1.finditer(page_content):
        for link in obj2.finditer(section.group("movies")):
            print(link.group("title"))
            href = link.group("href")
            # urljoin resolves "/path", "path" and already-absolute hrefs
            # correctly, which plain string concatenation does not.
            child_href_list.append(urljoin(url, href))
            url_list.append(href)

    # Step 2: visit each detail page and print the player link(s) found there.
    for child_href1 in child_href_list:
        try:
            child_resp = session.get(child_href1, headers=headers, timeout=REQUEST_TIMEOUT)
            child_resp.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable detail page must not abort the remaining ones;
            # report it on stderr so stdout still carries only results.
            print("skipping {}: {}".format(child_href1, exc), file=sys.stderr)
            continue
        child_resp.encoding = 'gb2312'
        child_page_content = child_resp.text
        for player in obj3.finditer(child_page_content):
            print(player.group("child_link_href"))
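# This is the first web-crawler script I wrote while following a video tutorial. Suggestions for improving it are welcome, so that we can learn from each other.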