# Data-site scraper
# The list of sites is collected from https://www.zhihu.com/question/19614805
import requests
from bs4 import BeautifulSoup
import re
import csv
from requests import HTTPError
import time
# Fetch the Zhihu question page and extract every external site link it lists.
url = r'https://www.zhihu.com/question/19614805'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
# timeout: a stalled connection must not hang the whole script;
# raise_for_status: fail loudly on e.g. 403 instead of silently
# parsing an error page into empty lists (HTTPError is imported above).
f = requests.get(url, headers=header, timeout=10)
f.raise_for_status()
file = f.text
txt = BeautifulSoup(file, 'html.parser')
# Zhihu wraps outbound links in <a class="wrap external"> redirect anchors.
a = txt.find_all('a', class_="wrap external")
urls = []
name = []
# Skip the first anchor (not a target site). Each href looks like
# https://link.zhihu.com/?target=https%3A//example.com — split on '='
# and the percent-encoded colon '%3A', then rejoin with ':' to recover
# the real URL; the anchor text is the site's display name.
for i in a[1:]:
    urls.append(':'.join(re.split(r'=|%3A', i.get("href"))[1:]))
    name.append(i.getText())
# Probe every extracted URL and record its HTTP status code, or the
# marker '无法访问' ("unreachable") when the request fails.
mode = []
# The header dict is loop-invariant — build it once, not per request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
for i in urls:
    try:
        f = requests.get(i, headers=header, timeout=10)
        mode.append(f.status_code)
    except requests.RequestException:
        # Narrowed from a bare `except:`: only network/HTTP failures mean
        # "unreachable" — a bare except would also swallow
        # KeyboardInterrupt/SystemExit and hide real bugs.
        mode.append('无法访问')
        print(i)
# Zip the three parallel lists (url / display name / status) into row
# dicts and write them out as a CSV report.
dicts = [
    {"url": u, "name": n, "mode": m}
    for u, n, m in zip(urls, name, mode)
]
# `with` guarantees the file is flushed and closed even if writing fails;
# the original left the handle open, risking a truncated file.
# newline='' is required by the csv module to avoid blank lines on Windows.
with open(r'D:\qianfengstudy\pac\url.csv', 'w', encoding='utf-8', newline='') as csv_file:
    f = csv.DictWriter(csv_file, ['url', 'name', 'mode'])
    f.writeheader()
    f.writerows(dicts)