package main
import (
"bufio"
"fmt"
"github.com/antchfx/htmlquery"
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
"time"
)
// Package-level synchronization state shared by main and saveTxt.
var (
	// wg is incremented by main for every download it schedules, released
	// by saveTxt via Done, and waited on by main before exiting.
	wg sync.WaitGroup

	// ch is created by main as a buffered channel; main sends one value per
	// scheduled download and saveTxt receives one when it finishes.
	ch chan int
)
// main crawls www.uidzhx.com starting from a fixed seed page. Every link on
// every visited page is followed (subject to the domain and URL filters), and
// for each page that has a title node and contains a .txt download link a
// saveTxt goroutine is started. main returns once the crawl has finished and
// all scheduled downloads have called wg.Done.
func main() {
	// ch acts as a counting semaphore: a slot is taken before a download is
	// started and released by saveTxt, so at most 10 downloads are in flight.
	ch = make(chan int, 10)

	// Compile both patterns once instead of on every response. Dots are
	// escaped so they only match literal dots.
	disallowed := regexp.MustCompile(`http://www\.uidzhx\.com/du/.*\.html`)
	// The link body is restricted to non-delimiter characters so the match
	// cannot run past the closing quote of the attribute to a later ".txt"
	// on the same line.
	txtLink := regexp.MustCompile(`http://dzs\.uidzhx\.com[^\s"'<>]*\.txt`)

	c := colly.NewCollector(
		colly.AllowedDomains("www.uidzhx.com"),
		colly.IgnoreRobotsTxt(),
		colly.DisallowedURLFilters(disallowed),
	)
	// Each URL is visited once and requests run synchronously, so Visit only
	// returns after the whole crawl is done.
	c.AllowURLRevisit = false
	c.Async = false
	extensions.RandomUserAgent(c)
	extensions.Referer(c)

	if err := c.Limit(&colly.LimitRule{
		DomainGlob: "*",
		//Parallelism: 2,
		RandomDelay: 1 * time.Second,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		// Visit reports already-visited and filtered URLs as errors; those
		// are expected during a crawl, so the result is deliberately dropped.
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// After a response has been received.
	c.OnResponse(func(r *colly.Response) {
		// Convert once; the string is used for both parsing and link search.
		body := string(r.Body)
		doc, err := htmlquery.Parse(strings.NewReader(body))
		if err != nil {
			// One unparsable page must not abort the whole crawl.
			log.Printf("parse %s: %v", r.Request.URL, err)
			return
		}
		title := htmlquery.FindOne(doc, `/html/body/div[4]/div[2]/div[1]/div/div[2]/div/h1`)
		if title == nil {
			return
		}
		url := txtLink.FindString(body)
		if url == "" {
			return
		}
		txtTitle := strings.Replace(htmlquery.InnerText(title), " ", "", -1)
		fmt.Println(txtTitle)

		// Register the download and take a semaphore slot before starting the
		// goroutine; saveTxt calls wg.Done and frees the slot. Without the
		// goroutine, wg.Wait below would never return.
		wg.Add(1)
		ch <- 1
		go saveTxt(txtTitle, url)
	})

	if err := c.Visit("http://www.uidzhx.com/Shtml89401.html"); err != nil {
		log.Printf("visit seed page: %v", err)
	}
	wg.Wait()
}
// sanitizeFileName replaces path separators, characters that are not allowed
// in Windows file names, and control characters with '_' so that a title
// taken from a crawled page cannot escape the target directory or make the
// file impossible to create.
func sanitizeFileName(name string) string {
	return strings.Map(func(r rune) rune {
		switch r {
		case '/', '\\', ':', '*', '?', '"', '<', '>', '|':
			return '_'
		}
		if r < 0x20 {
			return '_'
		}
		return r
	}, name)
}

// saveTxt downloads url and stores the text as d:/crawl/<title>.txt.
//
// It is meant to run as a goroutine whose caller has already done wg.Add(1)
// and sent one value on ch. On every exit path, including a panic raised by
// download, it calls wg.Done and frees one slot of ch. Failures are reported
// on stdout; nothing is returned.
func saveTxt(title string, url string) {
	defer wg.Done()
	// Release the semaphore slot on every exit path. The receive is
	// non-blocking so a caller that did not take a slot cannot hang here.
	defer func() {
		select {
		case <-ch:
		default:
		}
	}()
	// download panics on failure; contain that here so one failed download
	// does not take down the whole crawl.
	defer func() {
		if r := recover(); r != nil {
			fmt.Printf("save txt %s - %s failed: %v\n", title, url, r)
		}
	}()

	str := download(url)
	fmt.Printf("save txt %s - %s\n", title, url)

	const dir = "d:/crawl/"
	if err := os.MkdirAll(dir, 0777); err != nil {
		fmt.Printf("create dir err=%v\n", err)
		return
	}
	filePath := dir + sanitizeFileName(title) + ".txt"
	// O_TRUNC so that re-saving a shorter text does not leave stale bytes
	// from a previous, longer file.
	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
	if err != nil {
		fmt.Printf("open file err=%v\n", err)
		return
	}
	// Close the file handle promptly.
	defer file.Close()

	// Write through a buffered *Writer; the text is written exactly once.
	writer := bufio.NewWriter(file)
	if _, err := writer.WriteString(str); err != nil {
		fmt.Printf("write file err=%v\n", err)
		return
	}
	// Without Flush the buffered tail would never reach the file.
	if err := writer.Flush(); err != nil {
		fmt.Printf("flush file err=%v\n", err)
	}
}
// downloadClient is shared by all download calls so connections can be
// reused, and it carries a timeout so a stalled server cannot block forever.
var downloadClient = &http.Client{Timeout: 30 * time.Second}

// download fetches url with a GET request and returns the response body as a
// string.
//
// It panics with a descriptive string when the request cannot be built, the
// request fails, the server answers with a non-2xx status, or the body cannot
// be read. Callers running it in a goroutine should recover.
func download(url string) string {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		// A malformed URL yields a nil request; fail with a clear message
		// instead of a nil-pointer dereference on the next line.
		panic("http get err: " + err.Error())
	}
	req.Header.Set("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)")

	resp, err := downloadClient.Do(req)
	if err != nil {
		panic("http get err: " + err.Error())
	}
	defer resp.Body.Close()

	// An error page must not be returned as if it were the requested text.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		panic("http get err: unexpected status " + resp.Status)
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		panic("read error: " + err.Error())
	}
	return string(body)
}
// Trailing web-page residue, not part of the program: "|" and "请发表评论" ("Please leave a comment").