urllib爬取数据:
# -*- coding: utf-8 -*-
# Python 2 demo: fetch a page with urllib and download every jpg/png it references.
# (On Python 3 the equivalents live in urllib.request / urllib.parse.)
import urllib  # Python 2 urllib: urlopen / urlretrieve / urlencode
import re      # regular expressions
import time    # kept from the original notes (not used below)


def getHtml(url):
    """Fetch *url* and return the raw HTML as a string."""
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()  # original leaked the handle; always close it


def getImg(html):
    """Find every ``src="....jpg|png"`` in *html* and download each image.

    Files are saved as ./downloads/0.jpg, ./downloads/1.jpg, ...
    (the directory must already exist).
    """
    # (?:...) is non-capturing, so findall returns plain URL strings
    # instead of (url, extension) tuples.
    imglist = re.findall(r'src="(.*?\.(?:jpg|png))"', html)
    for x, imgurl in enumerate(imglist):
        print('正在下载%s' % imgurl)  # original was missing the % operator
        # original path had a typo: "dowmloads"
        urllib.urlretrieve(imgurl, './downloads/%d.jpg' % x)


if __name__ == '__main__':
    # Example usage — replace 'url地址' with a real page URL before running.
    getImg(getHtml('url地址'))

    # urlencode builds a query string, e.g. t=b&w=word
    params = urllib.urlencode({'t': 'b', 'w': 'word'})
    # GET: append the encoded params to the URL (original forgot the '?')
    f = urllib.urlopen('url地址?%s' % params)
    # POST: pass the encoded params as the second argument
    f = urllib.urlopen('url地址', params)
urllib2爬取数据:伪装成客户端request,服务器不会认定为爬虫
# -*- coding: utf-8 -*-
# Python 2 demo: fetch a page with urllib2 while sending browser-style
# request headers, so the server treats the client as a normal browser
# rather than a crawler. (On Python 3 use urllib.request.Request.)
import urllib2
import sys  # kept from the original notes (not used below)

url = ''  # target page URL — fill in before running

# Request headers mimicking a real browser request message.
send_headers = {
    'Host': '',
    'User-Agent': '',
    'Accept': 'text/html',
    'Connection': 'keep-alive',
}

# original misspelled the class as "Resquest"
req = urllib2.Request(url, headers=send_headers)
# original misspelled the module as "urllibs2"
r = urllib2.urlopen(req)
BeautifulSoup解析数据:
import bs4 from bs4 import BeautifulSoup #导入模块 soup = BeautifulSoup(html) #创建bs对象 soup.prettify() #格式化html标签 soup.标签名 #找标签 soup.标签.属性attrs/name #按标签属性、名称查找 List = soup.select(标签/.class/#id/属性) #选择器 List = soup.select("标签[id/class=value]") #组合选择器 List[index].string #获取文字 List[index]['属性'] #获取属性