The Python used here is Python 3.x, which differs slightly from Python 2.x in how some libraries are called.
First we need to know a couple of libraries: urllib and urllib.request, which handle network requests.
1. The simplest crawler
Crawling Baidu
import urllib
import urllib.parse
import urllib.request
# Crawl Baidu search results for a given keyword
data = {}
data['word'] = '火影'
url_values = urllib.parse.urlencode(data)
url = "http://www.baidu.com/s?"
full_url = url+url_values
data = urllib.request.urlopen(full_url).read()
data = data.decode('UTF-8')
print(data)
In the same way, we can crawl JD.com and search for product information.
import urllib
import urllib.parse
import urllib.request
import re
# Crawl JD.com for a given product keyword
data = {}
data['keyword'] = '被子'
url_values = urllib.parse.urlencode(data)
url = "http://search.jd.com/Search?"
fullurl = url + url_values + '&enc=utf-8'
print(fullurl)
d = urllib.request.urlopen(fullurl).read()
d = d.decode('utf-8')
print(d)
f = open('e:/test.txt', 'w', encoding='gbk')
src = 'title="(.*?)"'
s = re.findall(src, d)
for m in s:
    print(m)
c = d.encode('gbk', 'ignore')  # encode to GBK, dropping characters GBK cannot represent
c = c.decode('gbk', 'ignore')  # decode back to a GBK-safe string
f.write(c)
f.close()
Because the output text file is written in GBK while the original page is UTF-8, we first encode the decoded page to GBK (dropping characters GBK cannot represent) and then decode it back before writing.
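The lines below are a minimal standalone sketch of that encode/decode round trip (the sample string is made up), just to show how errors='ignore' silently drops characters GBK cannot represent:
# Sketch: make a string GBK-safe before writing it to a GBK-encoded file
text = '被子 😀 quilt'                      # the emoji has no GBK representation
gbk_safe = text.encode('gbk', 'ignore')    # bytes; the emoji is silently dropped
text = gbk_safe.decode('gbk')              # back to str, now safe to write to a GBK file
print(text)                                # -> '被子  quilt'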
2. Downloading images from a web page
# Python 3.4 crawler tutorial
# Download the images on a page
import urllib.request
import socket
import re
import sys
import os
targetDir = "E:\\123"  # directory where the images will be saved
def destFile(path):
    if not os.path.isdir(targetDir):
        os.mkdir(targetDir)
    pos = path.rindex('/')
    t = os.path.join(targetDir, path[pos+1:])
    return t
weburl = 'http://www.douban.com/'
webheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req = urllib.request.Request(url=weburl, headers=webheaders)  # build the request with headers
webpage = urllib.request.urlopen(req)  # send the request
contentBytes = webpage.read()
# print(contentBytes.decode('utf-8'))
content = contentBytes.decode('utf-8')
pattern = re.compile('<img.*?src="(.*?)"', re.S)  # regex that captures the image URLs
items = re.findall(pattern, content)
for item in items:
    print(item)
    urllib.request.urlretrieve(item, destFile(item))
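Note that src attributes are not always absolute http:// URLs; on many pages they are relative or protocol-relative (starting with //), and urlretrieve would fail on those. A small sketch, assuming you want to normalize such URLs first, is to join each match against the page URL:
# Sketch (assumption, not in the original): normalize relative or protocol-relative image URLs
from urllib.parse import urljoin
for item in items:
    absolute = urljoin(weburl, item)   # e.g. '//img.example.com/a.jpg' -> 'http://img.example.com/a.jpg'
    print(absolute)
    urllib.request.urlretrieve(absolute, destFile(absolute))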
3. Pages that require login, using our school's information portal as an example
Here we need to POST the login form data.
import urllib.request
import urllib.parse
import re
import http.cookiejar
class USTC:
    def __init__(self):
        self.loginurl = "http://portal.uestc.edu.cn/"
        self.header = {
            'Connection': 'Keep-Alive',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        }
        self.postDict = {'username': 'xxxxxxxx',
                         'password': 'xxxxxxxx',
                         }
    def getOpener(self, head):
        # Build an opener that keeps cookies across requests and carries the given headers
        cj = http.cookiejar.CookieJar()
        pro = urllib.request.HTTPCookieProcessor(cj)
        opener = urllib.request.build_opener(pro)
        header = []
        for key, value in head.items():
            elem = (key, value)
            header.append(elem)
        opener.addheaders = header
        return opener
    def getdata(self):
        opener = self.getOpener(header)  # uses the module-level header dict defined below
        postData = urllib.parse.urlencode(self.postDict).encode('utf-8')
        print(postData)
        op = opener.open(self.loginurl, postData)
        data = op.read()
        print(data.decode())
header = {'Content-Type': 'application/x-www-form-urlencoded',
          'Host': 'idas.uestc.edu.cn',
          'Origin': 'http://idas.uestc.edu.cn',
          'Referer': 'http://idas.uestc.edu.cn/authserver/login?service=http%3A%2F%2Fportal.uestc.edu.cn%2F',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
          }
sdu = USTC()
sdu.getdata()
After crawling a page and obtaining its HTML, you still need regular expressions to filter out the specific items you want.
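As a minimal sketch of that filtering step (the HTML fragment and pattern below are made-up examples, not taken from any of the sites above):
import re
# Made-up HTML fragment used only to illustrate filtering with re.findall
html = '<a href="/item/1" title="第一个商品">x</a><a href="/item/2" title="第二个商品">y</a>'
titles = re.findall('title="(.*?)"', html)   # non-greedy capture of each title attribute
print(titles)                                # -> ['第一个商品', '第二个商品']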