Web Scraping Examples (Part 1)

1. Baidu Products

# Requirement: save Baidu's "all products" page to a local file
import requests

response = requests.get(url='https://www.baidu.com/more/')
# print(response.text)
# print(response.content.decode())

# Save the products page to a local HTML file
with open('baidu_more.html', 'w', encoding='utf8') as f:
    f.write(response.text)

# Save an image: the body is raw bytes, so open the file in 'wb' mode
img_response = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1603206945778&di=9ea4d35a4622a99c97cc398a5fede3c5&imgtype=0&src=http%3A%2F%2Ffile02.16sucai.com%2Fd%2Ffile%2F2014%2F0419%2Fd9f4710e211cd8bce6b8ef361b805fd3.jpg')
with open('songshu.jpg', 'wb') as f:
    f.write(img_response.content)
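
For a large file it is better to stream the body in chunks than to hold it all in memory at once. A minimal sketch using requests' stream mode (the chunk size and the file name songshu_stream.jpg are arbitrary choices, not part of the original example):

import requests

img_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1603206945778&di=9ea4d35a4622a99c97cc398a5fede3c5&imgtype=0&src=http%3A%2F%2Ffile02.16sucai.com%2Fd%2Ffile%2F2014%2F0419%2Fd9f4710e211cd8bce6b8ef361b805fd3.jpg'
# stream=True defers the download; iter_content reads it piece by piece
response = requests.get(img_url, stream=True)
with open('songshu_stream.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=64 * 1024):  # 64 KB chunks
        f.write(chunk)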

2. Sina News

# Requirement: save Sina news search results to a local file
# 1. Import the library
import requests

# Base URL of the search endpoint
base_url = 'http://search.sina.com.cn/'
# Query parameters
param = {
    'q': 'java',
    'c': 'news',
    'from': 'index',
}
# 2. Decide the request method and URL; requests encodes params into the query string
response = requests.get(url=base_url, params=param)
print(response.url)
# Save to a local file
# with open('xinlang.html', 'w', encoding='utf8') as fp:
#     fp.write(response.text)
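
The params argument performs the same percent-encoding you would otherwise do by hand. A quick sketch showing the equivalence with the standard library's urlencode:

from urllib.parse import urlencode

param = {'q': 'java', 'c': 'news', 'from': 'index'}
# Produces the same query string that requests appended above
print('http://search.sina.com.cn/?' + urlencode(param))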

3. Baidu Search

# Requirement: whatever the user wants to search for, save that results page locally
import requests

wd = input('Enter a search term: ')
base_url = 'https://www.baidu.com/s'
param = {
    'wd': wd,
}
# A browser-like User-Agent, so Baidu serves the normal page instead of a block page
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
response = requests.get(url=base_url, params=param, headers=head)
print(response.url)
# print(response.request.headers)
# Name the file after the query so different searches do not overwrite each other
with open(f'{wd}.html', 'w', encoding='utf8') as fp:
    fp.write(response.text)
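
Before writing the file it is worth confirming the request actually succeeded; requests' built-in raise_for_status() raises an HTTPError for any 4xx/5xx response. A small self-contained sketch (the shortened User-Agent is just a placeholder):

import requests

response = requests.get('https://www.baidu.com/s', params={'wd': 'python'},
                        headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx status
print(response.status_code)  # 200 when everything went fine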

4. Hupu News

# Requirement: fetch the first five pages of Hupu news and save them locally

import requests

# Spot the pagination pattern in the URL:
#   https://voice.hupu.com/news?category=all&page=1
#   https://voice.hupu.com/news?category=all&page=2
#   https://voice.hupu.com/news?category=all&page=3
#   https://voice.hupu.com/news?category=all&page=4
#   https://voice.hupu.com/news?category=all&page=5
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

params = {
    'category': 'all'
}
for page in range(1, 6):
    params['page'] = page
    # The path must be /news to match the pattern above
    response = requests.get(url='https://voice.hupu.com/news', params=params, headers=head)
    with open(f'hupu{page}.html', 'w', encoding='utf8') as fp:
        fp.write(response.text)
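
When looping over pages like this, pausing between requests is politer to the server and less likely to get the client blocked. A sketch of the same loop with time.sleep (the one-second delay is an arbitrary choice):

import time
import requests

head = {'user-agent': 'Mozilla/5.0'}
for page in range(1, 6):
    response = requests.get('https://voice.hupu.com/news',
                            params={'category': 'all', 'page': page},
                            headers=head)
    print(page, response.status_code)
    time.sleep(1)  # wait one second before requesting the next page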
      

5. Baidu Translate

# Requirement: for whatever word the user enters, print all suggested translations
import requests

kw = input('Look up: ')
# x-requested-with marks the request as AJAX, which the sug endpoint expects
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}
data = {
    'kw': kw
}
response = requests.post(url='https://fanyi.baidu.com/sug', data=data, headers=headers)
# print(response.json())
# print(type(response.json()))

# Parse the JSON body into a dict
data_list = response.json()
print(data_list)
# print("------------------------------------------------------------")

# Pull the 'data' list out of the JSON
content = data_list['data']
# print(content)

# Each entry pairs a keyword 'k' with its translations 'v'
for i in content:
    # print(i)
    a = i['k']
    b = i['v']
    print(a, b)
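
The lookup is easier to reuse wrapped in a function. A minimal sketch (the name baidu_sug is my own; the 'data'/'k'/'v' keys are the ones the loop above already relies on):

import requests

def baidu_sug(kw):
    """Return (keyword, translations) pairs from Baidu Translate's sug endpoint."""
    headers = {
        'user-agent': 'Mozilla/5.0',
        'x-requested-with': 'XMLHttpRequest',
    }
    resp = requests.post('https://fanyi.baidu.com/sug', data={'kw': kw}, headers=headers)
    resp.raise_for_status()
    return [(item['k'], item['v']) for item in resp.json()['data']]

for k, v in baidu_sug('hello'):
    print(k, v)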

6. Baidu Tieba

import requests

# Pagination pattern (pn steps by 50):
# page 1  https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0
# page 2  https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50
# page 3  https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=100
# page 4  https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=150
# page 5  https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=200
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
}
param = {
    'ie': 'utf-8',
    'kw': 'LOL',
}
for i in range(1, 11):
    # Page i starts at offset (i - 1) * 50
    param['pn'] = (i - 1) * 50
    response = requests.get(url='https://tieba.baidu.com/f', params=param, headers=headers)
    with open(f'tieba{i}.html', 'w', encoding='utf8') as fp:
        fp.write(response.text)
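
When hitting the same host ten times in a row, a requests.Session reuses the underlying TCP connection and keeps shared headers in one place. A sketch of the same loop on a session:

import requests

session = requests.Session()
session.headers['user-agent'] = 'Mozilla/5.0'  # sent with every request on this session

for i in range(1, 11):
    resp = session.get('https://tieba.baidu.com/f',
                       params={'ie': 'utf-8', 'kw': 'LOL', 'pn': (i - 1) * 50})
    print(i, resp.status_code)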

7. Mini Program Community

# Requirement: fetch the first 10 pages of listings and save them locally
import requests

params = {
    'mod': 'list',
    'catid': '1',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
for page in range(1, 11):
    params['page'] = page
    response = requests.get(url='http://www.wxapp-union.com/portal.php', params=params, headers=headers)
    print(response.url)
    # Save each page, following the same pattern as the earlier examples
    with open(f'wxapp{page}.html', 'w', encoding='utf8') as fp:
        fp.write(response.text)
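
Sections 4, 6, and 7 all repeat the same fetch-and-save pagination loop, so it can be factored into one helper. A sketch under my own naming (fetch_pages and its parameters are not from the original code):

import requests

def fetch_pages(url, params, page_key, values, prefix, headers=None):
    """Fetch each value of `page_key` and save the response as <prefix><n>.html."""
    for n, value in enumerate(values, start=1):
        params[page_key] = value
        resp = requests.get(url, params=params, headers=headers)
        with open(f'{prefix}{n}.html', 'w', encoding='utf8') as fp:
            fp.write(resp.text)

# Example: the Hupu loop from section 4
fetch_pages('https://voice.hupu.com/news', {'category': 'all'},
            'page', range(1, 6), 'hupu', headers={'user-agent': 'Mozilla/5.0'})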