Day 21: Data Website Scraping Assignment

Data Website Scraping

The list of websites is collected from https://www.zhihu.com/question/19614805.

import csv
import re
import time

import requests
from bs4 import BeautifulSoup

# The Zhihu question page that lists the data websites
url = 'https://www.zhihu.com/question/19614805'
header = {
    # 'cookie': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}

# Fetch the question page and pull out the external links
resp = requests.get(url, headers=header)
soup = BeautifulSoup(resp.text, 'html.parser')
links = soup.find_all('a', class_="wrap external")

urls = []
names = []
for link in links[1:]:  # skip the first matched link
    # Zhihu wraps external links as ...?target=https%3A//example.com;
    # splitting on '=' and the encoded colon '%3A' and rejoining with
    # ':' recovers the real URL
    urls.append(':'.join(re.split(r'=|%3A', link.get('href'))[1:]))
    names.append(link.getText())

# Request each site and record its HTTP status code
modes = []
for site in urls:
    try:
        resp = requests.get(site, headers=header, timeout=10)
        modes.append(resp.status_code)
    except requests.RequestException:  # DNS failure, timeout, refused, ...
        modes.append('unreachable')
        print(site)
    time.sleep(1)  # be polite: pause between requests

# Assemble the rows and write them out as CSV
rows = [{'url': u, 'name': n, 'mode': m} for u, n, m in zip(urls, names, modes)]
with open(r'D:\qianfengstudy\pac\url.csv', 'w', encoding='utf-8', newline='') as fp:
    writer = csv.DictWriter(fp, ['url', 'name', 'mode'])
    writer.writeheader()
    writer.writerows(rows)
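
A note on the ':'.join(re.split(...)) trick: it reconstructs the real URL from Zhihu's redirect wrapper (hrefs look like https://link.zhihu.com/?target=https%3A//example.com), but it can mangle targets that themselves contain '=' or other percent-escapes. A more robust alternative is to parse the target query parameter with the standard library. This is just a sketch; extract_target is a hypothetical helper, not part of the original assignment:

from urllib.parse import urlparse, parse_qs

def extract_target(href):
    # Zhihu external links carry the real URL percent-encoded in the
    # 'target' query parameter; parse_qs percent-decodes it for us.
    # Fall back to the raw href when no target parameter is present.
    query = parse_qs(urlparse(href).query)
    return query.get('target', [href])[0]

For example, extract_target('https://link.zhihu.com/?target=https%3A//example.com') returns 'https://example.com'.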
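One more possible refinement: a single GET with a 10-second timeout will mark a site 'unreachable' on any transient hiccup. If you want fewer false negatives, requests can retry through a Session backed by urllib3's Retry. A minimal sketch, with arbitrary retry counts chosen for illustration:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 2 extra times on connection errors and common transient
# status codes, with exponential backoff between attempts
retry = Retry(total=2, backoff_factor=1,
              status_forcelist=[429, 500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

resp = session.get('https://example.com', timeout=10)
print(resp.status_code)

Swapping requests.get(site, ...) in the status-check loop for session.get(site, ...) would also reuse connections across requests.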