python12（网页信息爬取）

最新推荐文章于 2024-04-03 16:03:45 发布

目三

最新推荐文章于 2024-04-03 16:03:45 发布

阅读量70

点赞数

分类专栏： Python 文章标签： python

本文链接：https://blog.csdn.net/m0_63520118/article/details/130671915

版权

Python 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

#!/usr/bin/env python
# coding: utf-8

# # 一.标准库urllib的应用

# In[ ]:


# (1) Import the library.
# `import urllib` alone does not load the `request` submodule, so
# `urllib.request` may raise AttributeError; import the submodule explicitly.
import urllib.request


# In[3]:


# (2) Open the page and keep the response object for the cells below.
response = urllib.request.urlopen("https://www.jd.com/")


# In[4]:


# (3) Read the content of the page object.
html = response.read()


# In[5]:


# (4) Print the page content (binary).
print(html)


# In[15]:


# (5) Convert the content to text using the UTF-8 encoding.
html = str(html, encoding='UTF-8')


# In[13]:


# (6) Print the page content (UTF-8 text).
print(html)


# In[16]:


# Get the URL of the response.
print(response.geturl())


# In[17]:


# Get the HTTP status code: 200 means OK, 404 means not found.
print(response.getcode())


# In[18]:


# Get the headers sent back by the server.
print(response.getheaders())


# # 二.第三方库requests的应用

# In[19]:


# The two commands below are shell commands, not Python statements. Run them
# in a terminal (or as `%pip ...` in a Jupyter cell); left as bare lines they
# make this file a SyntaxError.
#
#     pip show requests


# In[20]:


#     pip install requests


# In[21]:


# Import the third-party library.
import requests


# In[22]:


# Target URL. The scheme must be followed by "//"; the original "http:/jd.com/"
# has no host part and is rejected by requests as an invalid URL.
url = "http://jd.com/"


# In[25]:


# Fetch the page and keep the result in `response`.
# A timeout keeps the call from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)


# In[26]:


# Show the raw text of the page. As a bare expression this only displays a
# value in an interactive notebook cell; in a plain script the result is
# discarded.
response.text


# In[27]:


# Print the HTML.
print(response.text)


# In[28]:


# The guessed encoding.
response.encoding


# In[29]:


# The encoding determined by analysing the page content.
response.apparent_encoding


# In[32]:


# Assign the analysed encoding to the guessed encoding. This happens after the
# text was printed above, so only later reads of response.text are affected.
response.encoding=response.apparent_encoding


# In[33]:


# HTTP status code: 200 means OK, 404 means not found.
response.status_code


# In[46]:


import requests

# Send a browser-like User-Agent header with the request.
kv = {'user-agent': 'Mozilla/5.0'}
url = 'https://item.jd.com/10054034809904.html'
try:
    # The timeout prevents an indefinite hang; raise_for_status() turns
    # 4xx/5xx responses into exceptions so they are reported as failures.
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP failures are reported here; a bare `except:` would
    # also hide programming errors and KeyboardInterrupt.
    print("爬取失败！")
else:
    # Use the encoding analysed from the content when decoding the text.
    r.encoding = r.apparent_encoding
    # Print the first 1000 characters.
    print(r.text[:1000])


# # 三.获取一张图片到指定位置

# In[61]:


# Imports
import requests
from PIL import Image
import matplotlib.pyplot as plt

# Send a browser-like User-Agent header with the request.
kv = {'user-agent': 'Mozilla/5.0'}
# Image URL
url = 'https://imgcps.jd.com/img-cubic/creative_server_cia_jdcloud/v2/2000366/100042368194/FocusFullshop/CkNqZnMvdDEvMTUzNzMyLzMyLzIyODgwLzMzMDA5LzY0NTk0YjdlRjg2ZTYyYTgxL2FjM2UxNTQ4OTZhYjUxOGMucG5nEgk1LXR5XzBfNTYwAjjui3pCFAoQ5LiJ5pifR2FsYXh5IFMyMxABQhAKDOeVheS6q-S8mOWTgRACQhAKDOeri-WNs-aKoui0rRAGQgoKBuWKm-iNkBAHWMLJ9df0Ag/cr/s/q.jpg'
# Where the downloaded image is stored
path = r'..\R&Q_pic\pic.jpg'
try:
    # Fetch the image, passing the User-Agent via the request headers.
    # raise_for_status() prevents an HTTP error page from being saved as a
    # ".jpg"; the timeout prevents an indefinite hang.
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()

    # Write the bytes; `with` closes the file even if the write fails.
    with open(path, 'wb') as f:
        f.write(r.content)

    # Re-open the file just written (reusing `path` instead of repeating the
    # literal) and store a second copy.
    with Image.open(path) as im:
        im.save(r'..\Stu_pack\pic.jpg')

        # Show the image without axes.
        plt.imshow(im)
        plt.axis('off')
        # Needed for the figure to appear when run as a plain script.
        plt.show()
except (requests.RequestException, OSError):
    # Network/HTTP failures and file or image-decoding errors
    # (PIL.UnidentifiedImageError is an OSError subclass).
    print("爬取失败！")


# # 四.第三方库beautifulsoup4的应用

# In[62]:


# The two commands below are shell commands, not Python statements. Run them
# in a terminal (or as `%pip ...` in a Jupyter cell); left as bare lines they
# make this file a SyntaxError.
#
#     pip show beautifulsoup4


# In[63]:


#     pip install beautifulsoup4


# In[66]:


import requests
from bs4 import BeautifulSoup

url = 'http://jd.com'
# The timeout keeps the call from hanging forever on an unresponsive server.
r = requests.get(url, timeout=10)
# Decode with the encoding analysed from the content, as the earlier cells do,
# so the text passed to the parser is not mis-decoded.
r.encoding = r.apparent_encoding
print(r.text)


# In[68]:


# Parse the HTML with the standard-library parser backend.
soup = BeautifulSoup(r.text, 'html.parser')


# In[70]:


# Pretty-print the parsed document.
print(soup.prettify())


# In[ ]: