• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

C#Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素 ...

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

将Html格式内容转换为Csv内容,支持table(重点处理rowspan和colspan合并单元格)、p、div元素;不支持嵌套的table。

  /// <summary>
  /// Converts HTML markup to CSV-style text covering table, p and div elements.
  /// Each table is written as a "#flag_table#," marker line followed by one line per
  /// row; merged cells (rowspan/colspan) are expanded so that the cell text sits in
  /// the top-left slot of the merged area and the remaining slots are empty.
  /// Paragraph and div text is written after a "#flag_text#," marker line.
  /// Every value is followed by "," and every line ends with one extra ",".
  /// Nested tables are not supported.
  /// </summary>
  /// <param name="hrml">The HTML markup to convert. (The spelling of the parameter
  /// name is kept as-is so existing callers keep compiling.)</param>
  /// <returns>The CSV-style text built from the tables, paragraphs and divs found in
  /// the markup; an empty string when none are present.</returns>
  private string HtmlToCsv(string hrml)
  {
      HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
      doc.LoadHtml(hrml);
      StringBuilder sbLines = new StringBuilder();

      HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
      if (tList != null)
      {
          foreach (HtmlAgilityPack.HtmlNode table in tList)
          {
              sbLines.AppendLine("#flag_table#,");

              // ".//tr" is evaluated relative to this table. A leading "//" is anchored
              // at the document root and would return the rows of every table.
              HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
              if (rows == null)
              {
                  continue;
              }

              AppendTableAsCsv(sbLines, rows);

              // Empty the table so its text is not picked up again by the p/div passes.
              table.RemoveAll();
          }
      }

      // Paragraphs are emptied after being written so that a div wrapping them does
      // not emit the same text a second time.
      AppendTextNodesAsCsv(sbLines, doc.DocumentNode.SelectNodes("//p"), true);

      // Divs are left in place; a nested div therefore contributes its text once for
      // itself and once for each div that encloses it.
      AppendTextNodesAsCsv(sbLines, doc.DocumentNode.SelectNodes("//div"), false);

      return sbLines.ToString();
  }

  /// <summary>
  /// Lays the td cells of the given rows out on a grid, honouring rowspan and colspan,
  /// and appends the grid to <paramref name="sbLines"/> one line per row.
  /// Only td children of each row are read; other child nodes are ignored.
  /// </summary>
  /// <param name="sbLines">Builder that receives the CSV lines.</param>
  /// <param name="rows">The tr nodes of a single table, in document order.</param>
  private static void AppendTableAsCsv(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection rows)
  {
      // Upper bounds for the span attributes as defined by the HTML Standard.
      const int maxColSpan = 1000;
      const int maxRowSpan = 65534;

      int rowCount = rows.Count;

      // grid[r][c] == null means "slot not claimed yet"; any non-null value (including
      // the empty string) means the slot belongs to some cell.
      List<string>[] grid = new List<string>[rowCount];
      for (int r = 0; r < rowCount; r++)
      {
          grid[r] = new List<string>();
      }

      for (int r = 0; r < rowCount; r++)
      {
          int col = 0;
          foreach (HtmlAgilityPack.HtmlNode cell in rows[r].ChildNodes)
          {
              if (!string.Equals(cell.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase))
              {
                  continue;
              }

              // Step over slots already claimed by a rowspan that started in a row above.
              while (col < grid[r].Count && grid[r][col] != null)
              {
                  col++;
              }

              int colspan = ReadSpanAttribute(cell, "colspan", maxColSpan);
              int rowspan = ReadSpanAttribute(cell, "rowspan", maxRowSpan);
              string text = CleanCsvText(cell.InnerText);

              // Claim the whole merged area, but never past the last row of the table.
              int lastRow = System.Math.Min(rowCount, r + rowspan);
              int lastCol = col + colspan;
              for (int ri = r; ri < lastRow; ri++)
              {
                  List<string> line = grid[ri];
                  while (line.Count < lastCol)
                  {
                      line.Add(null);
                  }
                  for (int ci = col; ci < lastCol; ci++)
                  {
                      line[ci] = string.Empty;
                  }
              }

              // An empty merged cell is written as a single space; this matches the
              // output earlier versions of this method produced for merged cells.
              bool merged = colspan > 1 || rowspan > 1;
              grid[r][col] = (merged && text.Length == 0) ? " " : text;

              col = lastCol;
          }
      }

      // The table is as wide as its widest row; shorter rows are padded with empty values.
      int width = 0;
      foreach (List<string> line in grid)
      {
          width = System.Math.Max(width, line.Count);
      }

      // Assemble the CSV lines.
      foreach (List<string> line in grid)
      {
          for (int c = 0; c < width; c++)
          {
              if (c < line.Count && line[c] != null)
              {
                  sbLines.Append(line[c]);
              }
              sbLines.Append(",");
          }
          sbLines.AppendLine(",");
      }
  }

  /// <summary>
  /// Appends a "#flag_text#," marker line followed by one line per node containing
  /// the node's cleaned text. Nodes without visible text produce a line holding only ",".
  /// Nothing is appended when <paramref name="nodes"/> is null.
  /// </summary>
  /// <param name="sbLines">Builder that receives the CSV lines.</param>
  /// <param name="nodes">Nodes whose text is written; may be null.</param>
  /// <param name="clearAfterwards">When true, each node is emptied after it has been written.</param>
  private void AppendTextNodesAsCsv(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterwards)
  {
      if (nodes == null)
      {
          return;
      }

      sbLines.AppendLine("#flag_text#,");
      foreach (HtmlAgilityPack.HtmlNode node in nodes)
      {
          string text = GetTextByHtml(CleanCsvText(node.InnerText));
          if (!string.IsNullOrWhiteSpace(text))
          {
              sbLines.Append(text).Append(",");
          }
          sbLines.AppendLine(",");

          if (clearAfterwards)
          {
              node.RemoveAll();
          }
      }
  }

  /// <summary>
  /// Reads a colspan/rowspan style attribute. A missing, non-numeric or non-positive
  /// value is treated as 1 (so rowspan="0" is handled as a single row), and larger
  /// values are capped at <paramref name="max"/>.
  /// </summary>
  /// <param name="cell">The cell node carrying the attribute.</param>
  /// <param name="name">Attribute name, e.g. "colspan".</param>
  /// <param name="max">Largest value that is returned.</param>
  /// <returns>A span between 1 and <paramref name="max"/>.</returns>
  private static int ReadSpanAttribute(HtmlAgilityPack.HtmlNode cell, string name, int max)
  {
      HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[name];
      if (attr == null)
      {
          return 1;
      }

      int value;
      bool parsed = int.TryParse(
          attr.Value,
          System.Globalization.NumberStyles.Integer,
          System.Globalization.CultureInfo.InvariantCulture,
          out value);
      if (!parsed || value < 1)
      {
          return 1;
      }

      return System.Math.Min(value, max);
  }

  /// <summary>
  /// Removes the pieces of text that would break the CSV layout: "&amp;nbsp;" entities,
  /// commas and line breaks are dropped, then surrounding whitespace is trimmed.
  /// </summary>
  /// <param name="raw">Inner text of an HTML node.</param>
  /// <returns>The cleaned text.</returns>
  private static string CleanCsvText(string raw)
  {
      return raw.Replace("&nbsp;", "").Replace(",", "").Replace("\r", "").Replace("\n", "").Trim();
  }

 

html: 

 

csv:

 

url:http://www.cnblogs.com/dreamman/p/5343924.html

 


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
初学c++动态联编发布时间:2022-07-13
下一篇:
C#.NETSocket发布时间:2022-07-13
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap