python-requests(三)

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/mango_haoming/article/details/70390944

requests库爬虫实例

1、爬取京东网页

>>> import requests
>>> r = requests.get("http://item.jd.com/3434759.html")
>>> r.status_code
200
>>> r.encoding
'gbk'
>>> r.text[:1000]
'<!-- shouji -->\n<!DOCTYPE HTML>\n<html lang="zh-CN">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=gbk" />\n    <title>【锤子M1L】锤子 M1L(SM919)4GB+32GB 白色 全网通4G手机 双卡双待 全金属边框【行情 报价 价格 评测】-京东</title>\n    <meta name="keywords" content="smartisanM1L,锤子M1L,锤子M1L报价,smartisanM1L报价"/>\n    <meta name="description" content="【锤子M1L】京东JD.COM提供锤子M1L正品行货,并包括smartisanM1L网购指南,以及锤子M1L图片、M1L参数、M1L评论、M1L心得、M1L技巧等信息,网购锤子M1L上京东,放心又轻松" />\n    <meta name="format-detection" content="telephone=no">\n    <meta http-equiv="mobile-agent" content="format=xhtml; url=//item.m.jd.com/product/3434759.html">\n    <meta http-equiv="mobile-agent" content="format=html5; url=//item.m.jd.com/product/3434759.html">\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge">\n    <link rel="canonical" href="//item.jd.com/3434759.html"/>\n        <link rel="dns-prefetch" href="//misc.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//static.360buyimg.com"/>\n    <link rel="dns-prefetch" href="//img10.360buyimg.c'

完整代码

import requests

# Crawl a JD.com product page and print the first 1000 characters.
url = "http://item.jd.com/3434759.html"
try:
    r = requests.get(url)
    r.raise_for_status()               # raise HTTPError for non-2xx status codes
    r.encoding = r.apparent_encoding   # guess encoding from content, not just headers
    print(r.text[:1000])
    # Original had `return"爬取成功"` — a SyntaxError (return outside a function);
    # print the success message instead.
    print("爬取成功")
except Exception:
    print("爬取失败")

2、爬取亚马逊网页

>>> import requests
>>> kv = {'user-agent':'Mozila/5.0'}
>>> r = requests.get("https://www.amazon.cn/gp/product/B00MEY0VWW/ref=s9_acss_bw_cg_kin_1c1_w?pf_rd_m=A1U5RCOVU0NYF2&pf_rd_s=merchandised-search-2&pf_rd_r=T5XCJNEEB6GW3DANWNW0&pf_rd_t=101&pf_rd_p=190844af-fd7e-4d63-b831-fbd5601cfa0d&pf_rd_i=116087071",headers=kv)
>>> r.status_code
200
>>> r.request.headers
{'Accept': '*/*', 'Connection': 'keep-alive', 'Accept-Encoding': 'gzip, deflate', 'user-agent': 'Mozila/5.0'}
>>> r.text[:100]
'<!doctype html><html class="a-no-js" data-19ax5a9jf="dingo"><!-- sp:feature:head-start -->\n<head><sc'
>>> r.text[1000:2000]
')}}}})(ue_csm);\n\n\n    var ue_err_chan = \'jserr-rw\';\n(function(d,e){function h(f,b){if(!(a.ec>a.mxe)&&f){a.ter.push(f);b=b||{};var c=f.logLevel||b.logLevel;c&&c!==k&&c!==m&&c!==n&&c!==p||a.ec++;c&&c!=k||a.ecf++;b.pageURL=""+(e.location?e.location.href:"");b.logLevel=c;b.attribution=f.attribution||b.attribution;a.erl.push({ex:f,info:b})}}function l(a,b,c,e,g){d.ueLogError({m:a,f:b,l:c,c:""+e,err:g,fromOnError:1,args:arguments},g?{attribution:g.attribution,logLevel:g.logLevel}:void 0);return!1}var k="FATAL",m="ERROR",n="WARN",p="DOWNGRADED",a={ec:0,ecf:0,\npec:0,ts:0,erl:[],ter:[],mxe:50,startTimer:function(){a.ts++;setInterval(function(){d.ue&&a.pec<a.ec&&d.uex("at");a.pec=a.ec},1E4)}};l.skipTrace=1;h.skipTrace=1;h.isStub=1;d.ueLogError=h;d.ue_err=a;e.onerror=l})(ue_csm,window);\n\n\nvar ue_id = \'SAS6G3Q9MD9R54H1QQJS\',\n    ue_url = \'/gp/uedata\',\n    ue_navtiming = 1,\n    ue_mid = \'AAHKV2X7AFYLW\',\n    ue_sid = \'459-2260863-3297312\',\n    ue_sn = \'www.amazon.cn\',\n    ue_furl = \'fls-cn.amazon.cn'
>>> 

完整代码

import requests

# Crawl an Amazon.cn product page. Amazon rejects the default
# python-requests User-Agent, so a browser-like UA header is supplied.
url = "https://www.amazon.cn/gp/product/B00MEY0VWW/ref=s9_acss_bw_cg_kin_1c1_w?pf_rd_m=A1U5RCOVU0NYF2&pf_rd_s=merchandised-search-2&pf_rd_r=T5XCJNEEB6GW3DANWNW0&pf_rd_t=101&pf_rd_p=190844af-fd7e-4d63-b831-fbd5601cfa0d&pf_rd_i=116087071"
try:
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()               # raise HTTPError for non-2xx status codes
    r.encoding = r.apparent_encoding   # guess encoding from content, not just headers
    print(r.text[1000:2000])
except Exception:
    print("爬取失败")

3、搜索引擎关键词提交

  • 百度的关键词接口

http://www.baidu.com/s?wd=keyword

  • 360的关键词接口

http://www.so.com/s?q=keyword

百度关键词提交:

>>> import requests
>>> kv = {'wd':'python'}
>>> r = requests.get("http://www.baidu.com/s",params=kv)
>>> r.status_code
200
>>> r.request.url
'http://www.baidu.com/s?wd=python'
>>> len(r.text)
334380
>>> r.text[:1000]
'<!DOCTYPE html>\n<!--STATUS OK-->\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\n\n\n<html>\n\t<head>\n\t\t\n\t\t<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n\t\t<meta http-equiv="content-type" content="text/html;charset=utf-8">\n\t\t<meta content="always" name="referrer">\n        <meta name="theme-color" content="#2932e1">\n        <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />\n        <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu.svg">\n        <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" /> \n\t\t\n\t\t\n<title>python_百度搜索</title>\n\n\t\t\n\n\t\t\n<style data-for="result" type="text/css" id="css_newi_result">body{color:#333;background:#fff;padding:6px 0 0;margin:0;position:relative;min-width:900px}\nbody,th,td,.p1,.p2{font-family:arial}\np,form,ol,ul,li,dl,dt,dd,h3{margin:0;padding:0;list-style:none}\ninput{pad'
>>> 

完整代码

import requests

# Submit a keyword to Baidu's search interface (http://www.baidu.com/s?wd=keyword)
# and report the size of the result page.
keyword = "python"
try:
    kv = {'wd': keyword}               # `params=` URL-encodes and appends ?wd=...
    r = requests.get("http://www.baidu.com/s", params=kv)
    print(r.request.url)               # show the final URL requests actually sent
    r.raise_for_status()               # raise HTTPError for non-2xx status codes
    print(len(r.text))
except Exception:
    print("爬取失败")


http://www.baidu.com/s?wd=python
322584
>>> 

360关键词提交:

>>> import requests
>>> kv = {'q':'python'}
>>> r = requests.get("http://www.so.com/s",params=kv)
>>> r.status_code
200
>>> r.request.url
'https://www.so.com/s?q=python'
>>> len(r.text)
227744
>>> print(r.text[2000:3000])
r,a.g-a-noline:hover em,.g-a-noline a:hover em{text-decoration:underline}.g-ellipsis{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.g-f-yahei{font-family:arial,"WenQuanYi Micro Hei","Microsoft YaHei",SimHei}.g-shadow{box-shadow:0 1px 1px rgba(0,0 ,0,0.06)}.g-clearfix{zoom:1}.g-card{border:1px solid #e5e5e5;font-size:13px;_zoom:1}.g-btn{border:0;border-radius:1px;box-sizing:content-box;cursor:pointer;display:inline-block;outline:none;overflow:hidden;padding:0 10px;text-align:center;text-decoration:none;vertical-align:middle}.g-btn-icon{display:inline-block;_padding-top:7px}.g-btn-green{background:#19b955;border:1px solid #19b955;color:#fff;font-size:12px;height:24px;line-height:24px}input.g-btn,button.g-btn{line-height:20px;*padding:0 5px}.g-clearfix:after{clear:both;content:'';display:block;height:0;visibility:hidden}.g-card .g-card-foot{border-top:1px solid #e5e5e5;height:36px;line-height:36px;padding:0 10px}.g-card .g-card-foot-open,.g-card .g-card-foot-close{padding:0}.g
>>> 

完整代码:

import requests

# Submit a keyword to 360's search interface (http://www.so.com/s?q=keyword)
# and report the size of the result page.
keyword = "python"
try:
    kv = {'q': keyword}                # `params=` URL-encodes and appends ?q=...
    r = requests.get("http://www.so.com/s", params=kv)
    print(r.request.url)               # show the final URL requests actually sent
    r.raise_for_status()               # raise HTTPError for non-2xx status codes
    print(len(r.text))
except Exception:
    print("爬取失败")


https://www.so.com/s?q=python
215119
>>> 

4、网络图片的爬取和存储

网络图片的链接格式:
http://www.example.com/picture.jpg

一张图片的链接
http://image.baidu.com/search/detail?ct=503316480&z=undefined&tn=baiduimagedetail&ipn=d&word=%E7%88%B1%E5%A3%81%E7%BA%B8&step_word=&ie=utf-8&in=&cl=2&lm=-1&st=undefined&cs=3752912346,2452166601&os=3385445049,2037359231&simid=4144739372,697793985&pn=0&rn=1&di=83110628040&ln=1956&fr=&fmq=1492829415188_R&fm=&ic=undefined&s=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&is=0,0&istype=0&ist=&jit=&bdtype=0&spn=0&pi=0&gsm=0&hs=2&objurl=http%3A%2F%2Fh5.86.cc%2Fwalls%2F20150106%2F1440x900_b3cf5a29601634a.jpg&rpstart=0&rpnum=0&adpicid=0

import requests

# Download a web resource and save it to disk in binary mode.
#
# NOTE(review): this URL is a Baidu image-search *detail page* (HTML), not the
# picture itself — the actual .jpg is the percent-encoded `objurl` query
# parameter. TODO confirm which resource the author intended to save.
url = "http://image.baidu.com/search/detail?ct=503316480&z=undefined&tn=baiduimagedetail&ipn=d&word=%E7%88%B1%E5%A3%81%E7%BA%B8&step_word=&ie=utf-8&in=&cl=2&lm=-1&st=undefined&cs=3752912346,2452166601&os=3385445049,2037359231&simid=4144739372,697793985&pn=0&rn=1&di=83110628040&ln=1956&fr=&fmq=1492829415188_R&fm=&ic=undefined&s=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&is=0,0&istype=0&ist=&jit=&bdtype=0&spn=0&pi=0&gsm=0&hs=2&objurl=http%3A%2F%2Fh5.86.cc%2Fwalls%2F20150106%2F1440x900_b3cf5a29601634a.jpg&rpstart=0&rpnum=0&adpicid=0"
# Original had "D:abc.jpg" (a drive-relative path — the backslash was likely
# eaten by the blog's markdown); use an explicit, escaped separator.
path = "D:\\abc.jpg"
r = requests.get(url)
print(r.status_code)
with open(path, 'wb') as f:            # 'wb': raw bytes; `with` closes the file —
    f.write(r.content)                 # the original's extra f.close() was redundant

5、IP地址归属地查询

查询网址的接口
http://m.ip138.com/ip.asp?ip=ipaddress

>>> import requests
>>> url = 'http://m.ip138.com/ip.asp?ip='
>>> r = requests.get(url+'116.7.245.184')
>>> r.status_code
200
>>> r.text[-500:]
'"submit" value="查询" class="form-btn" />\r\n\t\t\t\t\t</form>\r\n\t\t\t\t</div>\r\n\t\t\t\t<div class="query-hd">ip138.com IP查询(搜索IP地址的地理位置)</div>\r\n\t\t\t\t<h1 class="query">您查询的IP:116.7.245.184</h1><p class="result">本站主数据:广东省深圳市  电信</p><p class="result">参考数据一:广东省广州市 电信</p>\r\n\r\n\t\t\t</div>\r\n\t\t</div>\r\n\r\n\t\t<div class="footer">\r\n\t\t\t<a href="http://www.miitbeian.gov.cn/" rel="nofollow" target="_blank">沪ICP备10013467号-1</a>\r\n\t\t</div>\r\n\t</div>\r\n\r\n\t<script type="text/javascript" src="/script/common.js"></script></body>\r\n</html>\r\n'
>>> 

完整代码:

import requests

# Query ip138's geolocation interface (http://m.ip138.com/ip.asp?ip=ipaddress)
# and print the tail of the page, which contains the lookup result.
url = 'http://m.ip138.com/ip.asp?ip='
try:
    # Reuse the `url` variable (the original re-typed the full literal here).
    r = requests.get(url + '116.7.245.184')
    r.raise_for_status()               # raise HTTPError for non-2xx status codes
    r.encoding = r.apparent_encoding   # guess encoding from content, not just headers
    print(r.text[-500:])               # fixed: original line was missing the closing ')'
except Exception:
    print("爬取失败")
>>>
阅读更多
换一批

没有更多推荐了,返回首页