爬虫入门实例十

import urllib.request
from http import cookiejar
import requests
from bs4 import BeautifulSoup

def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns the page HTML on HTTP 200, or ``None`` on any other status
    code or on a network/request error.
    """
    # BUG FIX: in the original, a backslash line-continuation inside the
    # Cookie string fused "Host:www.douban.com'" into the cookie VALUE,
    # so no real Host header was sent and the cookie was corrupted.
    # Host is now its own header entry.
    headers = {
        'Cookie': ('bid=JoOO5Fbfy_U; ll="118222"; dbcl2="193741190:O0l7hIScuG0"; '
                   '__yadk_uid=5Vr8NOIhZyajVLUbw4QRMLcikPJe2WI7; ap_v=0,6.0; ck=DzWi; '
                   '_pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1553257668%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DudVIRnaKEQc_BZ87WGydbrGiqh4xQUpVcfWD7qk8KTC%26wd%3D%26eqid%3Dd77e0eb40004fdf0000000025c94d4bf%22%5D; '
                   '_pk_id.100001.8cb4=0c5ba53798789183.1553170026.3.1553257668.1553255348.; '
                   '_pk_ses.100001.8cb4=*; push_noty_num=0; push_doumail_num=0; __utmt=1; '
                   '__utma=30149280.1854474263.1552918691.1553255332.1553257668.4; '
                   '__utmb=30149280.2.10.1553257668; __utmc=30149280; '
                   '__utmz=30149280.1553257668.4.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
                   '__utmv=30149280.19374'),
        'Host': 'www.douban.com',
        'User-Agent': ("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                       "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 "
                       "SE 2.X MetaSr 1.0"),
    }
    try:
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return r.text
        # Non-200 responses are treated as a miss, same as the original.
        return None
    except requests.RequestException:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.
        return None

def html_parser(html):
    """Parse *html* and print the ``href`` of every ``<a>`` tag.

    Safely does nothing when *html* is ``None`` or empty — the failure
    value returned by ``get_html`` — instead of raising ``TypeError``
    inside BeautifulSoup as the original did.
    """
    if not html:
        return
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.find_all('a'):
        # `get` returns None for anchors without an href attribute.
        print(link.get('href'))

def main():
    """Entry point: fetch the Douban front page and print all link hrefs."""
    url = 'https://www.douban.com/'
    html = get_html(url)
    # get_html returns None on failure; skip parsing in that case so the
    # script exits cleanly instead of relying on downstream handling.
    if html is not None:
        html_parser(html)

# Guard the entry point so importing this module does not trigger a
# network request (the original called main() unconditionally).
if __name__ == "__main__":
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值