获取一个网页内容的标准方法:
import requests
def zpx(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the string "产生异常" when the request
        fails (connection problem, timeout, invalid URL, or an HTTP error
        status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Raise requests.HTTPError when the server answered with an
        # error status (4xx/5xx).
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the
        # one taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures map to the sentinel string;
        # programming errors (e.g. NameError) are no longer hidden.
        return "产生异常"
# Demo: download the Baidu home page and print whatever zpx() returns.
url = "http://www.baidu.com"
print(zpx(url))
爬取京东商品
>>> import requests as a
>>> r=a.get("https://item.jd.com/7408023.html")
>>> r.status_code
200
>>> r.encoding
'gbk'
>>> r.text[:1000]
全代码如下:
import requests
# Download a JD.com product page and print the first 1000 characters.
url = "https://item.jd.com/7408023.html"
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Decode with the encoding detected from the body.
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.RequestException:
    # Catch only request failures so unrelated errors stay visible.
    print("爬取失败")
爬取亚马逊商品
>>> url="https://www.amazon.cn/dp/B07BB5N86W/ref=gwgfloorv1_SOFTLINE_a_1?pf_rd_p=50f92a02-a3b8-4f02-8291-45f56297c423&pf_rd_s=desktop-6&pf_rd_t=36701&pf_rd_i=desktop&pf_rd_m=A1AJ19PSB66TGU&pf_rd_r=9ECNVF0WDYQMWZSM82DH&pf_rd_r=9ECNVF0WDYQMWZSM82DH&pf_rd_p=50f92a02-a3b8-4f02-8291-45f56297c423"
>>> r=requests.get(url)
>>> r.status_code
503
>>> r.request.headers
{'User-Agent': 'python-requests/2.19.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
>>> r=requests.get(url,headers={'User-Agent':'Mozilla/5.0'})  # 将User-Agent伪装成浏览器
>>> r.request.h
Traceback (most recent call last):
File "<pyshell#13>", line 1, in <module>
r.request.h
AttributeError: 'PreparedRequest' object has no attribute 'h'
>>> r.request.headers
{'User-Agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
>>> r.text[:1000]  # 输出前1000个字符
爬取亚马逊代码:
import requests as a
# Download an Amazon.cn product page and print characters 1000-1999.
url = "https://www.amazon.cn/dp/B00QJDOLIO/ref=lp_1536596071_1_1?s=amazon-devices&ie=UTF8&qid=1539086987&sr=1-1"
try:
    # Send a browser-like User-Agent instead of the default
    # python-requests one.
    zpx = {'user-agent': 'Mozilla/5.0'}
    # A timeout keeps the script from hanging on an unresponsive server.
    r = a.get(url, headers=zpx, timeout=30)
    r.raise_for_status()
    # Decode with the encoding detected from the body.
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])
except a.RequestException:
    # Catch only request failures so unrelated errors stay visible.
    print("爬取失败")
百度的关键词接口:
http://www.baidu.com/s?wd=keyword
360的关键词接口:
http://www.so.com/s?q=keyword
自动用百度、360搜索:
>>> import requests as a
>>> zpx={'wd':'Python'}
>>> r=a.get("http://www.baidu.com/s",params=zpx)
>>> r.status_code
200
>>> r.request.url
'http://www.baidu.com/s?wd=Python'
>>> len(r.text)
253870
百度搜索全代码(360搜索只需把接口换成 http://www.so.com/s,并把参数名 wd 换成 q):
import requests
# Query Baidu for a keyword, then print the final request URL and the
# length of the returned page.
keyword = "张鹏旭"
try:
    # Baidu's search endpoint takes the keyword in the "wd" parameter.
    zpx = {'wd': keyword}
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get("http://www.baidu.com/s", params=zpx, timeout=30)
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except requests.RequestException:
    # Catch only request failures so unrelated errors stay visible.
    print("爬取失败")
爬取网络图片:
>>> url3="http://img.netbian.com/file/2018/1005/cf0b972b88bc7b122412af86f541135c.jpg"
>>> r=a.get(url3)
>>> r.status_code
200
>>> path2="//Users//zhangpengxu//Desktop//223.jpg"
>>> with open(path2,'wb') as f:
f.write(r.content)
208143
>>> r.close()
>>>
爬取图片标准代码:
import requests as a
import os
# Download one image into the target directory unless it is already there.
url = "http://img.netbian.com/file/2018/0719/00fb820dcb32d4f18ce306a91eccceaf.jpg"
root = "//Users//zhangpengxu//Desktop//"
path = root + url.split('/')[-1]  # file name = last segment of the URL
try:
    # Create the target directory (and any missing parents) if needed.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        r = a.get(url, timeout=30)
        # Do not save an HTTP error page as if it were the image.
        r.raise_for_status()
        # The with-block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (a.RequestException, OSError):
    # Network failures and file-system errors are both reported as a
    # failed download; other exceptions are left to propagate.
    print("爬取失败")
IP地址查询全代码:
import requests as a
# Look up an IP address on ip138 and print the last 500 characters of
# the returned page.
url = "http://m.ip138.com/ip.asp?ip="
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = a.get(url + "202.204.80.112", timeout=30)
    r.raise_for_status()
    # Decode with the encoding detected from the body.
    r.encoding = r.apparent_encoding
    print(r.text[-500:])
except a.RequestException:
    # Catch only request failures so unrelated errors stay visible.
    print("爬取失败")
生成英语词云:
#导入词云的包
from wordcloud import WordCloud
#导入matplotlib作图的包
import matplotlib.pyplot as plt
# Read the whole text file as one UTF-8 string; the with-block makes sure
# the file handle is closed afterwards.
with open('//Users//zhangpengxu//Desktop//123.txt', 'r', encoding='utf-8') as fh:
    f = fh.read()
# Build the word cloud from the text.
wordcloud = WordCloud(
    background_color="white",  # white background (the default is black)
    width=1500,                # image width
    height=960,                # image height
    margin=10,                 # margin around the image
).generate(f)
# Draw the image.
plt.imshow(wordcloud)
# Hide the axes.
plt.axis("off")
# Display the figure.
plt.show()
# Save the word cloud to disk.
wordcloud.to_file('//Users//zhangpengxu//Desktop//123.png')
简单的生成词云:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
# Build and display a word cloud from a Chinese text file.
path_text = '//Users//zhangpengxu//Desktop//123.txt'
# Read the whole file; the with-block closes the handle afterwards.
with open(path_text, "r", encoding='UTF-8') as fh:
    f = fh.read()
# Segment the text with jieba and join the tokens with spaces before
# handing them to WordCloud.
cut_text = " ".join(jieba.cut(f))
wordcloud = WordCloud(
    # NOTE(review): presumably chosen because it contains Chinese glyphs;
    # confirm the font exists on the target machine.
    font_path='//Library//Fonts//Xingkai.ttc',
    background_color="white",
    width=1000, height=880,
).generate(cut_text)
# `interpolation` takes a string such as 'none', 'nearest', 'bilinear',
# 'bicubic', 'spline16', 'spline36', 'hanning', 'hamming', 'hermite',
# 'kaiser', 'quadric', 'catrom', 'gaussian', 'bessel', 'mitchell',
# 'sinc' or 'lanczos'.
plt.imshow(wordcloud, interpolation="none")
plt.axis("off")
plt.show()
高级一点的生成词云
from wordcloud import WordCloud,ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import jieba
from PIL import Image
# Build a word cloud shaped and coloured after a background image.

# Phrases registered with jieba so they are kept as single tokens.
ls = ["校青协宣传部部长","负责海报制作及新媒体宣传方向","校优秀干事","校优秀志愿者","最美宜阳马拉松大赛奖牌","英语四级","驾驶证","操作广联达","CAD软件","maka","沈阳建联造价咨询公司实习","性格开朗","团队意识","善于与他人沟通","法库县村村通工程","北陵公园公厕","现场测量"]
for items in ls:
    jieba.add_word(items)
path_text = '//Users//zhangpengxu//Desktop//123.txt'
path_img = "//Users//zhangpengxu//Desktop//6666.jpg"
# Load the image into an array; the with-block closes the image file.
with Image.open(path_img) as img:
    background_image = np.array(img)
# Colour function that takes its colours from the background image.
image_color = ImageColorGenerator(background_image)
# Read the whole text file; the with-block closes the handle afterwards.
with open(path_text, "r", encoding='UTF-8') as fh:
    f = fh.read()
# Segment the text with jieba and join the tokens with spaces before
# handing them to WordCloud.
cut_text = " ".join(jieba.cut(f))
wordcloud = WordCloud(
    font_path='//Library//Fonts//Xingkai.ttc',
    background_color="white",
    mask=background_image,  # the image array is used as the mask
).generate(cut_text)
# `interpolation` takes a string such as 'none', 'nearest', 'bilinear',
# 'bicubic', 'spline16', 'spline36', 'hanning', 'hamming', 'hermite',
# 'kaiser', 'quadric', 'catrom', 'gaussian', 'bessel', 'mitchell',
# 'sinc' or 'lanczos'.
plt.imshow(wordcloud.recolor(color_func=image_color), interpolation="bilinear")
plt.axis("off")
plt.show()
wordcloud.to_file("//Users//zhangpengxu//Desktop//zhangpengxu.jpg")