Golang网页下载示例

2018-07-20    来源:open-open

容器云强势上线!快速搭建集群,上万Linux镜像随意使用
package main
 
/*
 * Handles Chinese character-encoding issues.
 */
 
import (
    "errors"
    "flag"
    "fmt"
    query "github.com/PuerkitoBio/goquery"
    "golang.org/x/text/encoding/simplifiedchinese"
    "io/ioutil"
    "net/http"
    "os"
    "path/filepath"
    "runtime"
    "strings"
    "sync"
)
 
 
// np holds the number of logical CPUs reported by the runtime when the
// package is initialised.
var np = runtime.NumCPU()

// The blank variable exists only to run GOMAXPROCS(np) during package
// initialisation, before main starts. Since Go 1.5 the runtime already
// defaults GOMAXPROCS to the CPU count, so this is effectively a no-op there.
var _ = runtime.GOMAXPROCS(np)
 
// wg counts the crawlFolder goroutines started by crawl; main blocks on it
// so the program does not exit while sub-folders are still being crawled.
var wg sync.WaitGroup
 
// Folder describes a remote directory that still has to be crawled.
//
// Url is the address of the remote directory listing and Dir is the local
// directory into which its contents are mirrored.
type Folder struct {
    Url, Dir string
}
 
// File describes a single remote file to download.
//
// Url is the address the file is fetched from, Dir is the local directory it
// is written into, and Name is the local file name inside Dir.
type File struct {
    Url, Dir, Name string
}
 
 
// checkErr terminates the program with exit status 1 after printing err to
// standard output. A nil err is ignored and the function simply returns.
func checkErr(err error) {
    if err == nil {
        return
    }
    fmt.Printf("%v\n", err.Error())
    os.Exit(1)
}
 
 
// decodeToGBK interprets the bytes of text as GB18030 (a superset of GBK) and
// returns them re-encoded as UTF-8. Despite its name, the conversion goes
// from GB18030 to UTF-8, not the other way round.
//
// On failure the original text is returned unchanged together with the error.
func decodeToGBK(text string) (string, error) {
    // Decoder.String sizes its own output buffer. A fixed len(text)*2 buffer
    // is too small when single input bytes expand to three UTF-8 bytes
    // (0x80 -> '€', invalid bytes -> U+FFFD), which made such input fail
    // with ErrShortDst.
    decoded, err := simplifiedchinese.GB18030.NewDecoder().String(text)
    if err != nil {
        return text, err
    }
    return decoded, nil
}
 
 
// printEach writes the text content of item to standard output, prefixed
// with "Selection: ". The index argument is accepted but not used.
func printEach(index int, item *query.Selection) {
    text := item.Text()
    fmt.Println("Selection: ", text)
}
 
 
// isDir reports whether path names a directory, which here means simply
// that it ends with a forward slash.
func isDir(path string) bool {
    // TrimSuffix shortens the string only when the suffix is present.
    withoutSlash := strings.TrimSuffix(path, "/")
    return len(withoutSlash) != len(path)
}
 
 
// makeFolder builds the Folder for the sub-directory that the link item
// points to.
//
// url is the address of the page containing the link; the link's href is
// appended to it to form Folder.Url. dir is the local directory of that
// page; the decoded link text is joined onto it to form Folder.Dir.
//
// A nil Folder and an error are returned when the link text cannot be
// decoded or when the element has no href attribute. The decoding failure
// takes precedence if both apply.
func makeFolder(item *query.Selection, url, dir string) (f *Folder, err error) {
    label := item.Text()
    link, found := item.Attr("href")

    decoded, decodeErr := decodeToGBK(label)
    switch {
    case decodeErr != nil:
        return nil, decodeErr
    case !found:
        return nil, errors.New("makeFolder : " + label + " href属性不存在")
    }

    return &Folder{Url: url + link, Dir: filepath.Join(dir, decoded)}, nil
}
 
 
// makeFile builds the File for the download that the link item points to.
//
// url is the address of the page containing the link; the link's href is
// appended to it to form File.Url. dir becomes File.Dir unchanged, and the
// decoded link text becomes File.Name.
//
// A nil File and an error are returned when the element has no href
// attribute or when the link text cannot be decoded.
func makeFile(item *query.Selection, url, dir string) (f *File, err error) {
    label := item.Text()

    link, found := item.Attr("href")
    if !found {
        return nil, errors.New("makeFile : " + label + " href属性不存在")
    }

    decoded, decodeErr := decodeToGBK(label)
    if decodeErr != nil {
        return nil, decodeErr
    }

    return &File{Url: url + link, Dir: dir, Name: decoded}, nil
}
 
 
// crawl fetches the HTML page at url and walks every <a> element on it.
//
// A link whose text ends in "/" is treated as a sub-directory: it is crawled
// in its own goroutine, registered with wg. Any other link is treated as a
// file and downloaded synchronously into localDir.
//
// Errors are printed to standard output. A failure to fetch the page aborts
// the crawl of this page; a failure on an individual link skips that link.
func crawl(url, localDir string) {
    page, fetchErr := query.NewDocument(url)
    if fetchErr != nil {
        fmt.Printf("%v\n", fetchErr.Error())
        return
    }

    // Link hrefs are appended directly to the base, so it must end in "/".
    base := url
    if !strings.HasSuffix(base, "/") {
        base += "/"
    }

    page.Find("a").Each(func(_ int, link *query.Selection) {
        if !isDir(link.Text()) {
            file, err := makeFile(link, base, localDir)
            if err != nil {
                fmt.Printf("%v\n", err.Error())
                return
            }
            download(file)
            return
        }

        folder, err := makeFolder(link, base, localDir)
        if err != nil {
            fmt.Printf("%v\n", err.Error())
            return
        }
        // Register before starting the goroutine so wg.Wait cannot miss it.
        wg.Add(1)
        go crawlFolder(folder)
    })
}
 
 
// download fetches file.Url and writes the response body to the local path
// file.Dir/file.Name, creating file.Dir first if necessary.
//
// Nothing is returned: every failure is printed to standard output and
// aborts this download only. Responses with a status other than 200 OK are
// reported and not written to disk.
func download(file *File) {
    dir := file.Dir
    url := file.Url
    name := file.Name

    // MkdirAll returns nil when the directory already exists, so any error
    // here is a genuine failure and there is no point in downloading.
    if err := os.MkdirAll(dir, os.ModePerm); err != nil {
        fmt.Printf("%v\n", err.Error())
        return
    }
    // Best effort only: the process umask may have narrowed the mode that
    // MkdirAll applied. A failure here does not prevent writing the file.
    _ = os.Chmod(dir, os.ModePerm)

    // NOTE(review): http.Get uses the default client, which has no timeout.
    resp, err := http.Get(url)
    if err != nil {
        fmt.Printf("%v\n", err.Error())
        return
    }
    defer resp.Body.Close()

    // http.Get does not treat non-2xx responses as errors; without this
    // check an error page would be saved as if it were the file.
    if resp.StatusCode != http.StatusOK {
        fmt.Printf("download %v: unexpected status %v\n", url, resp.Status)
        return
    }

    body, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        fmt.Printf("%v\n", err.Error())
        return
    }

    // The string -> []rune -> string round trip replaces any invalid UTF-8
    // bytes in the path with U+FFFD.
    fp := string([]rune(filepath.Join(dir, name)))

    err = ioutil.WriteFile(fp, body, 0777)
    if err != nil {
        fmt.Printf("%v fp:[%v]\n", err.Error(), fp)
        return
    }
    fmt.Printf("Download: %+v\n", file)
}
 
 
// crawlFolder crawls folder.Url into folder.Dir and then marks one unit of
// work on wg as done. It is meant to be started as a goroutine after a
// matching wg.Add(1).
func crawlFolder(folder *Folder) {
    // Deferred so the WaitGroup is released on every exit path; otherwise
    // wg.Wait in main could block forever.
    defer wg.Done()

    crawl(folder.Url, folder.Dir)
}
 
 
// main parses the command-line flags, crawls the configured host into the
// configured local directory and waits for all sub-folder crawls to finish.
//
// Flags:
//
//	-host    base URL of the HTTP server to crawl
//	-locate  local directory that receives the downloaded files
func main() {
    var host, location string
    flag.StringVar(&host, "host", "http://localhost:8000", "HTTP服务地址Host")
    flag.StringVar(&location, "locate", "E:/Crawler下载文件", "本地文件系统绝对路径")
    flag.Parse()

    crawl(host, location)
    wg.Wait()
}

标签:

版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点!
本站所提供的图片等素材,版权归原作者所有,如需使用,请与原作者联系。

上一篇:HttpClient通过GET和POST获取网页内容

下一篇:Java 使用 NIO 进行文件合并输出