在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
基于网站 SEO 的需要,我做了一个采集百度和 Google 搜索关键字结果的小工具,在这里与大家分享一下。先看效果图,代码附后:View Code
/// <summary>
/// Runs a Baidu search for the text in <c>txtSearch</c> and binds the parsed
/// results to <c>dataGridView1</c>. Nothing is bound when the download fails
/// or the page reports no results.
/// </summary>
private void baidu_Click(object sender, EventArgs e)
{
    int num = 100; // number of results requested (sent as the "rn" query parameter)
    string word = txtSearch.Text.Trim();

    // Escape the query so '&', '#', '+', spaces etc. cannot break or truncate the URL.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(word) + "&rn=" + num;
    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();
    if (baidu.GetSearchCount(html) > 0)
    {
        dataGridView1.DataSource = baidu.GetKeywords(html, word);
    }
}

/// <summary>
/// Runs a Google search for the text in <c>txtSearch</c> and binds the parsed
/// results to <c>dataGridView1</c>. Nothing is bound when the download fails.
/// </summary>
private void google_Click(object sender, EventArgs e)
{
    int num = 100; // number of results requested (sent as the "num" query parameter)
    string word = txtSearch.Text.Trim();

    // Escape the query so '&', '#', '+', spaces etc. cannot break or truncate the URL.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(word)
        + "&aq=f&aqi=&aql=&oq=&num=" + num;
    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    dataGridView1.DataSource = google.GetKeywords(html, word);
}

/// <summary>
/// Downloads a search result page and returns its cleaned-up HTML.
/// </summary>
/// <param name="url">Absolute URL of the search page.</param>
/// <param name="Chareset">Name of the encoding used to decode the response body.</param>
/// <returns>
/// The page source after <see cref="formatHTML"/>, or an empty string when the
/// request or the read fails, so callers can test the result with
/// <c>string.IsNullOrEmpty</c> instead of parsing an error message as HTML.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    // NOTE(review): this offers the current user's default credentials to the remote
    // server if it asks for authentication - confirm that is wanted for public sites.
    request.UseDefaultCredentials = true;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            return readResponseStream(response, Chareset);
        }
    }
    catch (WebException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex);
        return string.Empty;
    }
    catch (IOException ex)
    {
        System.Diagnostics.Trace.WriteLine(ex);
        return string.Empty;
    }
}

/// <summary>
/// Reads the whole response body using the given encoding and cleans it with
/// <see cref="formatHTML"/>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Name of the encoding used to decode the body.</param>
/// <returns>The cleaned-up page source.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(Chareset)))
    {
        return formatHTML(reader.ReadToEnd());
    }
}

/// <summary>
/// Normalizes page source before it is handed to the result regexes: removes a
/// few decorative entities and all CR/LF/TAB characters, and decodes
/// <c>&amp;amp;</c> to <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The normalized source, or an empty string for null/empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // The entities are spelled out and the control characters use real escape sequences:
    // stripping plain spaces or the two-character text "/n" would destroy the tags and
    // URLs that the result parsers match afterwards.
    return htmlContent
        .Replace("&raquo;", "").Replace("&nbsp;", "")
        .Replace("&copy;", "").Replace("\r", "").Replace("\t", "")
        .Replace("\n", "").Replace("&amp;", "&");
}
// Article text: the Baidu and Google parsing was extracted into two classes. 1. The Baidu search class:
/// <summary>
/// Parses Baidu result pages: the reported total hit count and the list of
/// result titles and links.
/// </summary>
class BaiduSearch
{
    // Regex for the anchor inside each result heading; captures the href and the inner HTML.
    private const string ResultLinkPattern = "<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";

    protected string uri = "http://www.baidu.com/s?wd=";
    protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
    protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
    protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";

    /// <summary>
    /// Extracts the total result count that the page reports.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <returns>
    /// The reported count, or 0 when the page is empty, the count text is not
    /// found, or it cannot be parsed as an <c>int</c>.
    /// </returns>
    public int GetSearchCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        Match match = Regex.Match(html, resultPattern);
        if (!match.Success)
        {
            return 0;
        }

        // The count is shown with thousands separators, e.g. "1,230".
        string digits = match.Value.Replace(",", string.Empty);

        int result;
        int.TryParse(digits, out result); // leaves result at 0 on failure
        return result;
    }

    /// <summary>
    /// Extracts every result heading from the page as a <see cref="Keyword"/>.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">The search text. Not used by the current implementation.</param>
    /// <returns>Results in page order with IDs starting at 1; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int nextId = 1;
        foreach (Match hit in Regex.Matches(html, ResultLinkPattern))
        {
            Keyword keyword = new Keyword();
            keyword.ID = nextId++;
            // Drop nested tags from the title text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
// Article text: 2. The Google search class:
/// <summary>
/// Parses Google result pages into a list of result titles and links.
/// </summary>
class googleSearch
{
    // Regex for the anchor inside each result heading; captures the href and the inner HTML.
    private const string ResultLinkPattern = "<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";

    /// <summary>
    /// Extracts every result heading from the page as a <see cref="Keyword"/>.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">The search text. Not used by the current implementation.</param>
    /// <returns>Results in page order with IDs starting at 1; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int nextId = 1;
        foreach (Match hit in Regex.Matches(html, ResultLinkPattern, RegexOptions.IgnoreCase))
        {
            Keyword keyword = new Keyword();
            keyword.ID = nextId++;
            // Drop nested tags from the title text.
            keyword.Title = Regex.Replace(hit.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = hit.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
// Article text: one more thing - Excel export. Most readers probably have their own way of
// doing this; a simple exporter is included below anyway.
/// <summary>
/// Asks the user for a target file and writes the grid's column headers and rows to it.
/// The output is tab-separated text encoded as gb2312, saved with an .xls extension.
/// Nothing is written when the user cancels the dialog. Failures are reported in a
/// message box rather than thrown.
/// </summary>
/// <param name="dataGridview1">Grid whose headers and cell values are exported.</param>
public void ExportDataGridViewToExcel(DataGridView dataGridview1)
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Excel files (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 0;
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "导出Excel文件";
        // Default file name is a timestamp, e.g. 20240131-093000.
        saveFileDialog.FileName = DateTime.Now.ToString("yyyyMMdd-HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Without this check a cancelled dialog would still try to open a file.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        try
        {
            using (Stream myStream = saveFileDialog.OpenFile())
            using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312")))
            {
                int columnCount = dataGridview1.ColumnCount;
                string[] cells = new string[columnCount];

                // Header row.
                for (int i = 0; i < columnCount; i++)
                {
                    cells[i] = dataGridview1.Columns[i].HeaderText;
                }
                sw.WriteLine(string.Join("\t", cells));

                // Data rows.
                foreach (DataGridViewRow row in dataGridview1.Rows)
                {
                    // The empty "new row" placeholder carries no data.
                    if (row.IsNewRow)
                    {
                        continue;
                    }

                    for (int k = 0; k < columnCount; k++)
                    {
                        object value = row.Cells[k].Value;
                        cells[k] = value == null ? string.Empty : value.ToString();
                    }
                    sw.WriteLine(string.Join("\t", cells));
                }
            }

            MessageBox.Show("导出成功");
        }
        catch (Exception e)
        {
            // UI entry point: show the failure to the user instead of crashing the form.
            MessageBox.Show(e.ToString());
        }
    }
}

// Article text: here is the HttpState class as well. If you need the demo, e-mail me or
// leave your e-mail address.
// File: Httpstatus.cs

/// <summary>
/// Plain state holder for one HTTP exchange: URL, cookies, page source,
/// captcha details and a bag of additional parameters.
/// </summary>
class HttpState
{
    private CookieContainer _cookieContainer = new CookieContainer();
    private string _tmpValCodeFileName = "emptyPic.gif";
    private Hashtable _otherParams = new Hashtable();

    public string StatusDescription { get; set; }

    /// <summary>Callback URL, used in login tests.</summary>
    public string CallBackUrl { get; set; }

    /// <summary>Page URL in absolute form.</summary>
    public string Url { get; set; }

    /// <summary>Cookie information in string form.</summary>
    public string Cookies { get; set; }

    /// <summary>Cookie information. Starts as an empty container.</summary>
    public CookieContainer CookieContainer
    {
        get { return _cookieContainer; }
        set { _cookieContainer = value; }
    }

    /// <summary>Page source.</summary>
    public string Html { get; set; }

    /// <summary>Temporary captcha image file (absolute path).</summary>
    public string TmpValCodePic { get; set; }

    /// <summary>Temporary captcha image file name (relative path). Defaults to "emptyPic.gif".</summary>
    public string TmpValCodeFileName
    {
        get { return _tmpValCodeFileName; }
        set { _tmpValCodeFileName = value; }
    }

    /// <summary>Whether a captcha is present.</summary>
    public bool IsValCode { get; set; }

    /// <summary>Captcha URL.</summary>
    public string ValCodeURL { get; set; }

    /// <summary>Captcha value after recognition.</summary>
    public string ValCodeValue { get; set; }

    /// <summary>Other parameters. Starts as an empty table.</summary>
    public Hashtable OtherParams
    {
        get { return _otherParams; }
        set { _otherParams = value; }
    }

    /// <summary>
    /// Adds a parameter, or replaces its value when the key already exists
    /// (duplicate-add handling, added by fengcj 09/11/19 PM).
    /// </summary>
    public void addOtherParam(object key, object value)
    {
        // The indexer adds a missing key and overwrites an existing one.
        this.OtherParams[key] = value;
    }

    /// <summary>Removes the parameter stored under <paramref name="key"/>, if any.</summary>
    public void removeOtherParam(object key)
    {
        this.OtherParams.Remove(key);
    }

    /// <summary>Returns the parameter stored under <paramref name="key"/>, or null when absent.</summary>
    public object getOtherParam(object key)
    {
        return this.OtherParams[key];
    }
}
KeyWord.cs
/// <summary>
/// One search hit as shown in the results grid.
/// </summary>
class Keyword
{
    /// <summary>Sequence number assigned by the parser that produced this hit.</summary>
    public int ID { get; set; }

    /// <summary>Title text of the hit.</summary>
    public string Title { get; set; }

    /// <summary>Link target of the hit.</summary>
    public string Link { get; set; }
}
// Article text: since everyone asked for the demo, I tidied it up today and added Word and
// Excel export. I could not find a way to attach the file here, so e-mail me if you need it.
|
2023-10-27
2022-08-15
2022-08-17
2022-09-23
2022-08-13
请发表评论