Go语言爬虫1-网络请求

原作者: [db:作者] 来自: [db:来源] 收藏邀请

下面是找的几个例子：

例子1：获得百度首页的html源文件：

// Example 1: fetch the Baidu home page and print its HTML source.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main downloads http://www.baidu.com and writes the response body to
// standard output. Any error is printed and the program stops early.
func main() {
	response, err := http.Get("http://www.baidu.com")
	if err != nil {
		// When Get fails the response may be nil, so it must not be used.
		fmt.Println(err)
		return
	}
	// The body must always be closed once the request succeeded.
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println(err)
		return
	}

	fmt.Println(string(body))
}

例子2，增加了一些错误验证

// Example 2: the same download as example 1, with explicit error handling
// and a non-zero exit status on failure.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
)

// main downloads http://www.baidu.com/ and prints the body to standard
// output. Errors are reported on standard error and the process exits
// with status 1.
func main() {
	response, err := http.Get("http://www.baidu.com/")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	contents, err := ioutil.ReadAll(response.Body)
	// Close explicitly instead of deferring: os.Exit terminates the
	// process immediately and does not run deferred calls.
	response.Body.Close()
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	fmt.Printf("%s\n", contents)
}

net/http 包提供了 Get、Post、PostForm 三个函数，它们直接实现了一个简单的 HTTP 客户端。下面是使用 Get 的例子：

// Example: a minimal HTTP client built on http.Get, reporting errors
// through the log package.
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

// main downloads http://www.ghj1976.net/ and prints the response body.
// Any error is logged and terminates the program with a non-zero status.
func main() {
	res, err := http.Get("http://www.ghj1976.net/")
	if err != nil {
		log.Fatal(err)
	}

	body, err := ioutil.ReadAll(res.Body)
	// Close explicitly instead of deferring: log.Fatal calls os.Exit,
	// which does not run deferred calls.
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s", body)
}

例子3：把百度的网页存在本地一个文件：

// Example 3: download the Baidu home page and save it to a local file.
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
)

// main saves http://www.baidu.com to baidu.html in the current directory.
func main() {
	if err := download("http://www.baidu.com", "baidu.html"); err != nil {
		log.Fatal(err)
	}
}

// download fetches url and streams the response body into the file at
// path. An existing file is truncated first, so running the program
// repeatedly does not append duplicate copies of the page. It returns an
// error if the request fails, the status is not 200 OK, or the file cannot
// be created or written.
func download(url, path string) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Do not store an error page as if it were the requested document.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching %s: unexpected status %s", url, resp.Status)
	}
	fmt.Println(resp.StatusCode)

	// os.Create opens the file write-only, creating or truncating it.
	f, err := os.Create(path)
	if err != nil {
		return err
	}

	// io.Copy handles short reads, io.EOF and write errors, which a
	// hand-written Read/Write loop has to get right by itself.
	if _, err := io.Copy(f, resp.Body); err != nil {
		f.Close()
		return err
	}
	// Close can report a delayed write error, so its result is returned.
	return f.Close()
}

其他可以借鉴的

golang 批量检查页面

http://www.simonzhang.net/?p=1346

除了使用 Get、Post、PostForm 这三个函数来建立一个简单的客户端，还可以使用 http.Client 和 http.NewRequest 来构造自定义请求。

例子：设置常用请求头后请求百度页面

// Example: request the Baidu home page with custom request headers using
// http.Client and http.NewRequest.
package main

import (
	"compress/gzip"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"time"
)

// main fetches http://www.baidu.com with browser-like headers and prints
// the decoded page.
func main() {
	body, err := fetch("http://www.baidu.com")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(body))
}

// fetch sends a GET request for url with a set of common request headers
// and returns the response body, decompressed if the server answered with
// gzip. It returns an error if the request cannot be built or sent, if the
// status is not 200 OK, or if the body cannot be read.
func fetch(url string) ([]byte, error) {
	// A timeout keeps the program from hanging on an unresponsive server.
	client := &http.Client{Timeout: 10 * time.Second}

	request, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	request.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	// Setting Accept-Encoding by hand turns off the transport's automatic
	// gzip decoding, so only advertise gzip and decode it below.
	request.Header.Set("Accept-Encoding", "gzip")
	request.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	request.Header.Set("Cache-Control", "max-age=0")
	request.Header.Set("Connection", "keep-alive")

	response, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching %s: unexpected status %s", url, response.Status)
	}

	var reader io.Reader = response.Body
	if response.Header.Get("Content-Encoding") == "gzip" {
		zr, err := gzip.NewReader(response.Body)
		if err != nil {
			return nil, err
		}
		defer zr.Close()
		reader = zr
	}

	return ioutil.ReadAll(reader)
}

参考资料：

用golang的正则regexp：去除HTML，CSS，SCRIPT代码，仅保留页面文字
http://bpbp.iteye.com/blog/1668869

该文章已有0人参与评论

专题导读

【转】基于 Go Micro 框架构建一个简单的微服务接口发布时间：2022-07-10

GO语言从入门到放弃目录发布时间：2022-07-10