Web Scraping Notes

Basic Operations

# A simple fetch of the Baidu homepage
import urllib.request
file = urllib.request.urlopen("http://www.baidu.com")
data = file.read()
print(data)
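
# read() returns bytes, so print(data) shows a bytes literal. A minimal
# follow-up sketch for getting readable HTML, assuming the page is UTF-8
# encoded (Baidu's homepage declares that charset):
html = data.decode("utf-8")
print(html[:200])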

# Save the response to a local file
fhandle = open("C:/MyCode/Python/Python3/Scratch/1.html","wb")
fhandle.write(data)
fhandle.close()

# A more convenient alternative: urlretrieve downloads straight to a file
# and returns a (filename, headers) tuple; urlcleanup clears its cache
filename = urllib.request.urlretrieve("http://edu.51cto.com", filename="C:/MyCode/Python/Python3/Scratch/2.html")
urllib.request.urlcleanup()

# Inspect the response object from the first example
file.info()     # the response headers
file.getcode()  # the HTTP status code, e.g. 200
file.geturl()   # the URL that was actually fetched

# URL encoding and decoding
urllib.request.quote("http://www.sina.com.cn")
urllib.request.unquote("http%3A//www.sina.com.cn")
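
# In Python 3 these helpers live in urllib.parse (urllib.request merely
# re-exports them), so the following is the documented spelling:
import urllib.parse
urllib.parse.quote("http://www.sina.com.cn")      # 'http%3A//www.sina.com.cn'
urllib.parse.unquote("http%3A//www.sina.com.cn")  # 'http://www.sina.com.cn'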

# Simulate a Baidu search
keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
fhandle = open("C:/MyCode/Python/Python3/Scratch/hello.html","wb")
fhandle.write(data)
fhandle.close()

# Simulate a Baidu search with a Chinese keyword;
# non-ASCII characters must be percent-encoded before going into the URL
keywd = "超超级钢铁侠"
key_code = urllib.request.quote(keywd)
url = "http://www.baidu.com/s?wd=" + key_code
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
fhandle = open("C:/MyCode/Python/Python3/Scratch/super.html","wb")
fhandle.write(data)
fhandle.close()
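
# The same query string can also be built with urllib.parse.urlencode,
# which percent-encodes keys and values in one step (the same helper the
# POST example below uses); a sketch of the equivalent request:
import urllib.parse
params = urllib.parse.urlencode({"wd": "超超级钢铁侠"})
url = "http://www.baidu.com/s?" + params
data = urllib.request.urlopen(urllib.request.Request(url)).read()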

# Send a POST request
import urllib.parse
url = "http://www.iqianyue.com/mypost"
postdata = urllib.parse.urlencode({
    "name":"ceo@iqianyue.com",
    "pass":"aA123456"
}).encode('utf-8')
req = urllib.request.Request(url,postdata)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
data = urllib.request.urlopen(req).read()
fhandle = open("C:/MyCode/Python/Python3/Scratch/post.html","wb")
fhandle.write(data)
fhandle.close()
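
# The same POST written with context managers, so the response and the
# file handle are closed automatically; a stylistic variant, not a new API:
with urllib.request.urlopen(req) as resp:
    with open("C:/MyCode/Python/Python3/Scratch/post.html", "wb") as fhandle:
        fhandle.write(resp.read())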

# Enable DebugLog so HTTP request/response traffic is printed to the console
httphd = urllib.request.HTTPHandler(debuglevel=1)
httpshd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httphd,httpshd)
urllib.request.install_opener(opener)
data = urllib.request.urlopen("http://edu.51cto.com").read()

# Exception handling; a URLError does not always carry a status code,
# so check for the attributes before printing them
import urllib.error
try:
    urllib.request.urlopen("https://blog.baidusss.net/")
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
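
# Alternatively, catch HTTPError first: it is a subclass of URLError and
# is the variant that carries a status code, while a plain URLError (e.g.
# the DNS failure above) only has a reason. The URL here is illustrative:
try:
    urllib.request.urlopen("http://www.baidu.com/some-missing-page")
except urllib.error.HTTPError as e:
    print(e.code, e.reason)   # HTTP-level failure, e.g. 404
except urllib.error.URLError as e:
    print(e.reason)           # network-level failure, no status code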

Scraping Page Links

# Extract all links from a page
import re
import urllib.request
def getlink(url):
    # Pretend to be a browser via the User-Agent header
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    data = str(file.read())
    # Link pattern (raw string, so \s and \w are not treated as escapes);
    # findall returns (full match, last group) tuples because of the groups
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    # Remove duplicates
    link = list(set(link))
    return link
# The page to scrape
url = "https://search.jd.com/Search?keyword=shouji&enc=utf-8&wq=shouji&pvid=3f2c0aa481834822ac2b8ec15dde1ae6"
# Collect the link addresses contained in that page
linklist = getlink(url)
# Print each link; element [0] of each tuple is the full matched URL
for link in linklist:
    print(link[0])
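
# A small extension in the same style as the earlier examples: write the
# collected links to a text file (the path is illustrative):
fhandle = open("C:/MyCode/Python/Python3/Scratch/links.txt", "w")
for link in linklist:
    fhandle.write(link[0] + "\n")
fhandle.close()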