在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
// Command main is a small crawler that the author ported to Go from an
// earlier Scala version while learning the language. It downloads an index
// page, extracts the article links from it, fetches every article, strips the
// markup and writes the remaining text to "<title>.txt" in the current
// working directory.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"
	"time"
)

const (
	// siteRoot is prepended to the site-relative links found in the index.
	siteRoot = "http://www.yifan100.com"
	// indexURL is the listing page the crawl starts from.
	indexURL = siteRoot + "/dir/15136/"

	// statusRequestFailed is the pseudo status code Get reports when the
	// HTTP request itself fails.
	statusRequestFailed = -100
	// statusReadFailed is the pseudo status code Get reports when the
	// response body cannot be read.
	statusReadFailed = -200

	// requestTimeout bounds a whole request, including reading the body, so
	// that a stalled server cannot hang the crawler forever.
	requestTimeout = 30 * time.Second
)

var (
	// ptnIndexItem captures the link target (group 1) and the link text
	// (group 2) of one index entry.
	ptnIndexItem = regexp.MustCompile(`<a target="_blank" href="(.+\.html)" title=".+" >(.+)</a>`)
	// ptnContentRough captures everything between the article container and
	// the last `<div >` that follows it.
	// NOTE(review): `<div >` may have lost an attribute when this page was
	// extracted - confirm against the live markup.
	ptnContentRough = regexp.MustCompile(`(?s).*<div class="artcontent">(.*)<div >.*`)
	// ptnBrTag matches the line-break tag that is turned into CRLF.
	ptnBrTag = regexp.MustCompile(`<br>`)
	// ptnHTMLTag matches any remaining opening or closing tag.
	ptnHTMLTag = regexp.MustCompile(`(?s)</?.*?>`)
	// ptnSpace matches leading whitespace of the text and every single space.
	// NOTE(review): the second alternative is a plain space in this copy; it
	// may have been `&nbsp;` before the page was rendered - confirm.
	ptnSpace = regexp.MustCompile(`(^\s+)|( )`)

	// httpClient is shared by all requests. Unlike http.DefaultClient it has
	// a timeout.
	httpClient = &http.Client{Timeout: requestTimeout}
)

// Get downloads url and returns the response body together with the HTTP
// status code. When the request fails it returns "" and -100; when the body
// cannot be read it returns "" and -200.
func Get(url string) (content string, statusCode int) {
	resp, err := httpClient.Get(url)
	if err != nil {
		return "", statusRequestFailed
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", statusReadFailed
	}
	return string(data), resp.StatusCode
}

// IndexItem is one entry of the index page: the absolute URL of an article
// and the link text used as its title.
type IndexItem struct {
	url   string
	title string
}

// findIndex extracts every article link from the index page HTML in content.
// The returned slice is empty, not nil, when nothing matches. The error is
// always nil; it is kept so the signature stays unchanged for callers.
func findIndex(content string) (index []IndexItem, err error) {
	// -1 removes the former arbitrary cap of 10000 matches.
	matches := ptnIndexItem.FindAllStringSubmatch(content, -1)
	index = make([]IndexItem, len(matches))
	for i, m := range matches {
		index[i] = IndexItem{url: siteRoot + m[1], title: m[2]}
	}
	return index, nil
}

// extractContent pulls the article text out of the raw page HTML. It reports
// false when the article container is not found. <br> becomes CRLF, all
// other tags are dropped, and the text matched by ptnSpace is removed.
func extractContent(raw string) (content string, ok bool) {
	match := ptnContentRough.FindStringSubmatch(raw)
	if match == nil {
		return "", false
	}
	// Order matters: line breaks must be converted before the generic tag
	// pattern strips them.
	content = ptnBrTag.ReplaceAllString(match[1], "\r\n")
	content = ptnHTMLTag.ReplaceAllString(content, "")
	content = ptnSpace.ReplaceAllString(content, "")
	return content, true
}

// readContent downloads the article at url and returns its plain text. It
// returns "" when the download does not answer 200 (after printing a
// message) or when the page has no article container.
func readContent(url string) (content string) {
	raw, statusCode := Get(url)
	if statusCode != http.StatusOK {
		fmt.Printf("Fail to get the raw data from %s\n", url)
		return ""
	}
	content, _ = extractContent(raw)
	return content
}

// sanitizeFileName turns a scraped title into a name that is safe to use as
// a single file name: path separators, characters that are reserved on
// common file systems and control characters become '_'. A title that ends
// up empty yields "untitled".
func sanitizeFileName(title string) string {
	cleaned := strings.Map(func(r rune) rune {
		if r < 0x20 || strings.ContainsRune(`/\:*?"<>|`, r) {
			return '_'
		}
		return r
	}, title)
	cleaned = strings.TrimSpace(cleaned)
	if cleaned == "" {
		return "untitled"
	}
	return cleaned
}

// main fetches the index, then downloads every listed article and writes it
// to its own text file, reporting progress and failures on standard output.
func main() {
	fmt.Println(`Get index ...`)
	page, statusCode := Get(indexURL)
	if statusCode != http.StatusOK {
		fmt.Printf("Fail to get the index from %s (status %d)\n", indexURL, statusCode)
		return
	}

	index, err := findIndex(page)
	if err != nil {
		fmt.Printf("Fail to parse the index: %v\n", err)
		return
	}

	fmt.Println(`Get contents and write to file ...`)
	for _, item := range index {
		fmt.Printf("Get content %s from %s and write to file.\n", item.title, item.url)
		// The title comes from the remote page, so it is never used as a
		// path without sanitizing.
		fileName := sanitizeFileName(item.title) + ".txt"
		content := readContent(item.url)
		if err := ioutil.WriteFile(fileName, []byte(content), 0644); err != nil {
			fmt.Printf("Fail to write %s: %v\n", fileName, err)
			continue
		}
		fmt.Printf("Finish writing to %s.\n", fileName)
	}
}
代码行数比Scala版有一定增加,主要有以下几方面原因: 当然,Golang版的爬虫也有一个优势,就是编译速度很快;执行速度在现在的写法里还体现不出优势。Golang的特性goroutine在这里没有用到,这段代码今后会不断改进。
请发表评论