# Basic operations (基本操作)
import urllib.request
# Fetch a page, print the raw bytes, and save them to a local HTML file.
file = urllib.request.urlopen("http://www.baidu.com")
data = file.read()  # read() returns the whole response body as bytes
print(data)
# Context manager guarantees the file is closed even if write() fails.
with open("C:/MyCode/Python/Python3/Scratch/1.html", "wb") as fhandle:
    fhandle.write(data)
# urlretrieve downloads straight to disk; it returns a (filename, headers) tuple.
filename = urllib.request.urlretrieve("http://edu.51cto.com", filename="C:/MyCode/Python/Python3/Scratch/2.html")
urllib.request.urlcleanup()  # clear the temporary cache urlretrieve may leave behind
file.info()     # response headers (an http.client.HTTPMessage)
file.getcode()  # HTTP status code, e.g. 200
file.geturl()   # final URL after any redirects
# Percent-encoding helpers: quote() escapes reserved characters, unquote() reverses it.
urllib.request.quote("http://www.sina.com.cn")
urllib.request.unquote("http%3A//www.sina.com.cn")
# Build a search URL from an ASCII keyword and save the result page.
keywd = "hello"
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
# Context manager guarantees the file is closed even if write() fails.
with open("C:/MyCode/Python/Python3/Scratch/hello.html", "wb") as fhandle:
    fhandle.write(data)
# Non-ASCII keywords must be percent-encoded before being placed in a URL.
keywd = "超超级钢铁侠"
key_code = urllib.request.quote(keywd)
url = "http://www.baidu.com/s?wd=" + key_code
req = urllib.request.Request(url)
data = urllib.request.urlopen(req).read()
# Context manager guarantees the file is closed even if write() fails.
with open("C:/MyCode/Python/Python3/Scratch/super.html", "wb") as fhandle:
    fhandle.write(data)
import urllib.parse

# POST demo: supplying a data argument to Request switches the method to POST.
url = "http://www.iqianyue.com/mypost"
# urlencode builds "name=...&pass=..."; the body must be bytes, hence encode().
postdata = urllib.parse.urlencode({
    "name": "ceo@iqianyue.com",
    "pass": "aA123456",
}).encode('utf-8')
req = urllib.request.Request(url, postdata)
# Spoof a browser User-Agent so the server does not reject the script.
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0')
data = urllib.request.urlopen(req).read()
# Context manager guarantees the file is closed even if write() fails.
with open("C:/MyCode/Python/Python3/Scratch/post.html", "wb") as fhandle:
    fhandle.write(data)
# debuglevel=1 makes the handlers print the raw HTTP exchange to stdout.
httphd = urllib.request.HTTPHandler(debuglevel=1)
httpshd = urllib.request.HTTPSHandler(debuglevel=1)
opener = urllib.request.build_opener(httphd, httpshd)
# install_opener makes every subsequent urlopen() call use this opener globally.
urllib.request.install_opener(opener)
# Read the body so `data` holds bytes, consistent with the rest of this file
# (the original left it as an unread response object).
data = urllib.request.urlopen("http://edu.51cto.com").read()
import urllib.error
# Error-handling demo: a plain URLError (e.g. DNS failure for this bogus host)
# has a .reason but NO .code attribute — only its HTTPError subclass does.
# Guard with hasattr so the handler itself cannot raise AttributeError.
try:
    urllib.request.urlopen("https://blog.baidusss.net/")
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)    # HTTP status, present only for HTTPError
    if hasattr(e, "reason"):
        print(e.reason)  # underlying cause, e.g. a socket.gaierror
# Crawl the links on a page (爬取页面链接)
import re
import urllib.request
def getlink(url):
    """Fetch *url* and return a de-duplicated list of link matches.

    The pattern contains two capture groups, so ``findall`` yields
    ``(full_url, tail)`` tuples; callers should use ``match[0]`` for
    the full URL.  Order is not preserved (a set is used to dedupe).
    """
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # NOTE: installing the opener globally affects every later urlopen() call.
    urllib.request.install_opener(opener)
    file = urllib.request.urlopen(url)
    # Decode the body instead of str(bytes): str() would wrap the text in
    # "b'...'" and escape non-ASCII bytes, corrupting extracted URLs.
    data = file.read().decode("utf-8", errors="ignore")
    # Raw string: \s, \. and \w are regex escapes, not string escapes
    # (a non-raw string triggers invalid-escape warnings on modern Python).
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    link = re.compile(pat).findall(data)
    link = list(set(link))  # de-duplicate
    return link
# Crawl a JD search-results page and print each unique link found.
url = "https://search.jd.com/Search?keyword=shouji&enc=utf-8&wq=shouji&pvid=3f2c0aa481834822ac2b8ec15dde1ae6"
linklist = getlink(url)
# Each element is a (full_url, tail) tuple from the two regex groups;
# unpack and print only the full URL.
for full_url, _tail in linklist:
    print(full_url)