//jsoup_1.7.2, groovy_2.3.7
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.select.Elements
import java.net.*
// Fetch the article page and download every matching JPEG it references into ./img.
Document doc = Jsoup.connect("http://stock.591hx.com/article/2014-12-03/0000850005s.shtml").get()
Elements img = doc.select("img")

// The target directory does not depend on the image, so prepare it once
// instead of re-checking it for every <img> element.
def dir = "./img"
def file = new File(dir)
if (!file.exists()) {
    file.mkdirs()
}

img.each {
    // absUrl resolves a relative src against the page address; a plain
    // attr("src") would hand a relative path to new URL(...), which throws.
    def src = it.absUrl("src")
    // Only JPEGs published under /hnimg/201412/03 are wanted.
    if (src ==~ /.*\/hnimg\/201412\/03.*\.jpg/) {
        DownLoadPic(src, dir)
    }
}
/**
 * Downloads the resource at sUrl into the directory dir. The file is named
 * after the text following the last '/' of the URL.
 *
 * @param sUrl absolute URL of the picture to fetch
 * @param dir  directory the file is written into (not created here)
 */
def DownLoadPic(String sUrl, String dir){
    URL url = new URL(sUrl)
    def name = sUrl.split('/')[-1]
    File file = new File(dir, name)
    // withInputStream/withOutputStream close both streams even when the copy
    // throws. Previously the output stream was never closed and the data was
    // copied one unbuffered byte at a time.
    url.withInputStream { input ->
        file.withOutputStream { output ->
            output << input
        }
    }
}
Golang 版本：利用 channel（管道）和 goroutine
package main
import (
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
//"strconv"
"strings"
"sync"
)
// Package-level state shared by main, GetAlbum and GetItem.
var (
	// urlist holds the article pages that are scanned for images.
	urlist = [...]string{"http://stock.591hx.com/article/2014-12-03/0000850005s.shtml"}

	// album carries image URLs from GetAlbum to GetItem.
	album chan string

	// w counts the GetAlbum and GetItem goroutines that are still running.
	w sync.WaitGroup

	// dir is the output directory; file names are appended to it directly.
	dir string
)
// main creates the output directory, starts one GetAlbum goroutine per entry
// in urlist and blocks until every download goroutine has finished.
func main() {
	dir = "tmp_chenjo/"
	// MkdirAll succeeds when the directory already exists, so any error here
	// means the downloads would have nowhere to go.
	if err := os.MkdirAll(dir, 0777); err != nil {
		log.Fatalln(err)
	}

	album = make(chan string, 200)
	for _, v := range urlist {
		w.Add(1)
		go GetAlbum(v)
	}
	// Wait once, after every page has been started, so the pages are
	// processed concurrently instead of one at a time.
	w.Wait()
}
func GetAlbum(url string) {
data := GetUrl(url)
body := string(data)
//<img src="http://stock.591hx.com/images/hnimg/201412/03/64/13418266510200941552.jpg" alt="" /></p>
part := regexp.MustCompile(`<img src="(.*)" alt="" />`)
match := part.FindAllStringSubmatch(body, -1)
for _, v := range match {
if m, _ := regexp.MatchString(`.*/hnimg/201412/03/.*\.jpg`, v[1]); !m {
continue
}
//println(v[1])
album <- v[1]
w.Add(1)
go GetItem()
}
w.Done()
}
// GetItem takes one image URL from the album channel, downloads it with
// GetUrl and stores it in dir under the text following the last "/" of the
// URL. The number of bytes written is logged on success.
//
// Failures are logged rather than propagated, and w.Done is always called
// once so the WaitGroup stays balanced.
func GetItem() {
	url := <-album
	println(url)
	defer func() {
		// GetUrl signals failure by panicking; log it and let the other
		// downloads carry on.
		if ret := recover(); ret != nil {
			log.Println(ret)
		}
		w.Done()
	}()

	// The file name is whatever follows the final "/" of the URL.
	str := strings.Split(url, "/")
	name := str[len(str)-1]
	source := GetUrl(url)

	// WriteFile creates or truncates the file, writes the data and closes
	// it, returning an error from any of those steps; the close error used
	// to be dropped.
	if err := ioutil.WriteFile(dir+name, source, 0666); err != nil {
		log.Println(err)
		return
	}
	log.Println(len(source))
}
// GetUrl fetches url with an HTTP GET and returns the complete response body.
//
// Failures (the request cannot be made, or the body cannot be read in full)
// are reported by panicking with a map[string]string holding the keys
// "status", "url" and "error"; callers that want to survive a failed download
// must recover. A non-2xx response is logged, but its body is still returned.
func GetUrl(url string) []byte {
	// failure logs the URL and builds the value this function panics with.
	failure := func(err error) map[string]string {
		log.Println(url)
		status := map[string]string{}
		status["status"] = "400"
		status["url"] = url
		status["error"] = err.Error()
		return status
	}

	ret, err := http.Get(url)
	if err != nil {
		panic(failure(err))
	}
	// Close the body on every path so the underlying connection is released.
	defer ret.Body.Close()

	if ret.StatusCode < 200 || ret.StatusCode > 299 {
		log.Println(url, ret.Status)
	}

	data, err := ioutil.ReadAll(ret.Body)
	if err != nil {
		// A truncated download is a failure, not a result.
		panic(failure(err))
	}
	return data
}