在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
很多人用 Python 抓数据,语法简单又方便。我这里先用 C# 来抓数据,Python 留到后面自学,看看到时候能不能也写出一些好文章。
// Step 1: Publish the application to the IIS server.
//
// Step 2: Capture the network request with Chrome. Inspecting the headers
// confirms that the form is submitted with POST. The fields usually worth
// checking on a request are:
//   * Status code 200 - a normal response, no redirect involved.
//   * Content length - computed from the submitted form parameters. If the
//     computed length differs from the length in the captured packet, the
//     form parameters are probably being passed incorrectly and need checking.
//   * Content-Type - this may be application/json, in which case the
//     parameters are sent as JSON.
//   2.1 Request
//   2.2 Response - the area in the red box is the response data I want, but it
//       comes with many useless HTML tags that have to be replaced.
//
// Reference: regular expression for removing all HTML tags
// https://www.cnblogs.com/caok168/articles/2567117.html
//
// Step 3: Write the code.
using MyAutomaticRefund.Common;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace MyAutomaticRefund
{
    /// <summary>
    /// Form whose button POSTs a fixed set of ASP.NET WebForms fields to a test
    /// page and shows the response text with the HTML tags removed.
    /// </summary>
    public partial class Form1 : Form
    {
        // Assigned to the request's CookieContainer in CaptureData. Nothing in the
        // code shown here ever initialises it, so the request is sent with a null
        // container.
        static CookieContainer m_Cookie = null;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: fetches the page, strips every HTML tag from the
        /// response and shows the trimmed text in a message box.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string result = CaptureData();

            string regexstr = @"<[^>]*>";    // matches any HTML tag
            result = Regex.Replace(result, regexstr, string.Empty, RegexOptions.IgnoreCase);
            MessageBox.Show(result.Trim());
        }

        /// <summary>
        /// Posts the hard-coded form fields to the WebForm1 page and returns the
        /// response body.
        /// </summary>
        /// <returns>
        /// The response body when the status is 200 OK; an empty string when the
        /// status is anything else or when the request throws.
        /// </returns>
        private string CaptureData()
        {
            string result = "";
            try
            {
                string Url = "http://192.168.21.195:8044/WebForm1";

                // Direct cast instead of "as": a non-HTTP request type fails here
                // with a clear InvalidCastException rather than a later null dereference.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";
                request.Method = "POST";
                request.KeepAlive = true;
                request.Referer = "http://192.168.21.195:8044/WebForm1";
                request.ContentType = "application/x-www-form-urlencoded";
                // NOTE(review): gzip/deflate is advertised but AutomaticDecompression is
                // not set on this request; presumably HttpRequestClient.getResponseBody
                // decompresses the body - confirm.
                request.Headers.Add("Accept-Encoding", "gzip, deflate");
                request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.9");
                //request.Headers.Add("X-Requested-With", "XMLHttpRequest");
                request.CookieContainer = m_Cookie;
                request.Host = "192.168.21.195:8044";

                HttpRequestClient s = new HttpRequestClient(true);

                Dictionary<string, string> dic = new Dictionary<string, string>();
                dic.Add("__VIEWSTATE", "BPyDUNYWaU0XKZc1m/+hb4U4rlITcqsuUMRxGp8elUR/ZgK5owxYH030QThFNas5u/roAT5kBYSSMaMYwRFEcw1COVEwFNapeyfnLq2PZ98=");
                dic.Add("__VIEWSTATEGENERATOR", "B6E7D48B");
                dic.Add("__EVENTVALIDATION", "3d+F/uurjOe90j09vxq1PU5Tli3EVssuM/HOISX5tyLUP2HdR956+2PC8ezRt2kDXMr1BDkstuMTV11I0laFwL+adHjgxQYtfEObtF/NligPunXkuGz4LuJ6VYdtACGg");
                dic.Add("Button1", "Button");

                string formData = ConvertStrDic(dic);
                // For comparison, the encoded body as captured from the browser:
                //string formData = "__VIEWSTATE=BPyDUNYWaU0XKZc1m%2F%2Bhb4U4rlITcqsuUMRxGp8elUR%2FZgK5owxYH030QThFNas5u%2FroAT5kBYSSMaMYwRFEcw1COVEwFNapeyfnLq2PZ98%3D&__VIEWSTATEGENERATOR=B6E7D48B&__EVENTVALIDATION=3d%2BF%2FuurjOe90j09vxq1PU5Tli3EVssuM%2FHOISX5tyLUP2HdR956%2B2PC8ezRt2kDXMr1BDkstuMTV11I0laFwL%2BadHjgxQYtfEObtF%2FNligPunXkuGz4LuJ6VYdtACGg&Button1=Button";
                byte[] data = Encoding.UTF8.GetBytes(formData);

                // Content-Length counts bytes on the wire, so use the encoded byte
                // array rather than the character count of the string.
                request.ContentLength = data.Length;
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }

                // Dispose the response so the underlying connection is released.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode == HttpStatusCode.OK)
                    {
                        result = s.getResponseBody(response);
                    }
                }
                return result;
            }
            catch (Exception ex)
            {
                // Best effort: the caller still receives an empty string, but the
                // failure is no longer silent.
                System.Diagnostics.Debug.WriteLine(ex);
                return result;
            }
        }

        /// <summary>
        /// Builds an application/x-www-form-urlencoded body ("k1=v1&amp;k2=v2")
        /// from the given pairs.
        /// </summary>
        /// <param name="dic">Form field names and their raw (unencoded) values.</param>
        /// <returns>The encoded body; an empty string for an empty dictionary.</returns>
        string ConvertStrDic(Dictionary<string, string> dic)
        {
            // Both names and values must be URL-encoded before they are sent.
            return string.Join("&", dic.Select(item =>
                System.Web.HttpUtility.UrlEncode(item.Key) + "=" + System.Web.HttpUtility.UrlEncode(item.Value)));
        }

        /// <summary>
        /// Holder for the four posted form fields. Not referenced by the code shown here.
        /// </summary>
        public class formTable
        {
            public string __VIEWSTATE { get; set; }
            public string __VIEWSTATEGENERATOR { get; set; }
            public string __EVENTVALIDATION { get; set; }
            public string Button1 { get; set; }
        }
    }
}
// Run result
# How to write the same thing in Python: the library used here is urllib
# (urllib.request, urlopen and friends).

# Second approach
import urllib.parse
import urllib.request

# data ="__VIEWSTATE=BPyDUNYWaU0XKZc1m%2F%2Bhb4U4rlITcqsuUMRxGp8elUR%2FZgK5owxYH030QThFNas5u%2FroAT5kBYSSMaMYwRFEcw1COVEwFNapeyfnLq2PZ98%3D&__VIEWSTATEGENERATOR=B6E7D48B&__EVENTVALIDATION=3d%2BF%2FuurjOe90j09vxq1PU5Tli3EVssuM%2FHOISX5tyLUP2HdR956%2B2PC8ezRt2kDXMr1BDkstuMTV11I0laFwL%2BadHjgxQYtfEObtF%2FNligPunXkuGz4LuJ6VYdtACGg&Button1=Button"

# Form fields to post. Named form_fields rather than str so the builtin
# str type is not shadowed.
form_fields = {
    '__VIEWSTATE': 'i8VVE3gtBLKgjBpYFwMCruW86sOjv2lTpmzZF3mD3L/QIkX0Ode3Xc9MaMXFQnjiAK80xxkQ9rjTyjGrBWvbLwcxug4r9Akhmcxs/plCdYM=',
    '__VIEWSTATEGENERATOR': 'B6E7D48B',
    '__EVENTVALIDATION': '2X6ageL+LlYeiTSgyQPd/FPAhPtg350MNiiKvURoS4xMsysX+0HyGjGN93yx7K27/NubbDHt2oTpWbFCv1dampTLrZkWvsf32lHlqJUliAsudoGhZ3kwM/XCxXSlvhNr',
    'Button1': 'Button'
}
# headers={
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
# }
# form_fields = {'__VIEWSTATE': 'i8VVE3gtBLKgjBpYFwMCruW86sOjv2lTpmzZF3mD3L/QIkX0Ode3Xc9MaMXFQnjiAK80xxkQ9rjTyjGrBWvbLwcxug4r9Akhmcxs/plCdYM=','__VIEWSTATEGENERATOR':'B6E7D48B','__EVENTVALIDATION':'2X6ageL+LlYeiTSgyQPd/FPAhPtg350MNiiKvURoS4xMsysX+0HyGjGN93yx7K27/NubbDHt2oTpWbFCv1dampTLrZkWvsf32lHlqJUliAsudoGhZ3kwM/XCxXSlvhNr','Button1':'Button'}

# urlencode() percent-encodes the values; urlopen() needs the body as bytes.
data = bytes(urllib.parse.urlencode(form_fields), encoding='utf-8')
url = "http://192.168.21.195:8044/WebForm1"
# request = urllib.request.Request(url=url,data=data,headers=headers,origin_req_host='192.168.21.195:8044',unverifiable=True,method='POST')

# First way of writing it
# request = urllib.request.Request(url=url,data=data)
# response = urllib.request.urlopen(request)
# print(response.read().decode('utf-8'))

# Second way of writing it
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, data=data) as response:
    print(response.read().decode('utf-8'))
# print(response.read())  # without decoding, the output is not the text we want


# Too messy - tidy up and remove the debugging code.

# Second approach
import urllib.parse
import urllib.request

form_fields = {
    '__VIEWSTATE': 'i8VVE3gtBLKgjBpYFwMCruW86sOjv2lTpmzZF3mD3L/QIkX0Ode3Xc9MaMXFQnjiAK80xxkQ9rjTyjGrBWvbLwcxug4r9Akhmcxs/plCdYM=',
    '__VIEWSTATEGENERATOR': 'B6E7D48B',
    '__EVENTVALIDATION': '2X6ageL+LlYeiTSgyQPd/FPAhPtg350MNiiKvURoS4xMsysX+0HyGjGN93yx7K27/NubbDHt2oTpWbFCv1dampTLrZkWvsf32lHlqJUliAsudoGhZ3kwM/XCxXSlvhNr',
    'Button1': 'Button'
}

data = bytes(urllib.parse.urlencode(form_fields), encoding='utf-8')
url = "http://192.168.21.195:8044/WebForm1"

# Second way of writing it
with urllib.request.urlopen(url, data=data) as response:
    print(response.read().decode('utf-8'))
运行效果
# A further refinement: use a regular expression to remove the leftover HTML tags.
import re

# NOTE(review): `response` is the object returned by urlopen() in the earlier
# snippet; this presumably replaces that snippet's print() call, since a body
# that has already been read cannot be read again - confirm.
page_text = response.read().decode('utf-8')

# Drop everything that looks like <...>, then print the remaining text as a
# list of whitespace-separated tokens.
tag_pattern = re.compile(r'<[^>]*>')
tag_free = tag_pattern.sub('', page_text)
print(tag_free.split())
总结:1. 经测试,不需要额外添加请求头(headers)。 2. 抓数据方面,Python 语法比 C# 语法简单。 3. Python 语法注意事项:data 里面是键值对(字典)形式,写法和 JSON 一致。
请发表评论