首页 > > 网络编程 > Mysql >

随手正则写的 CSDN【只看楼主】功能

2018-06-17 19:40:09　来源：未知　阅读 ()

写这段代码的时候，没有注意到 CSDN 其实已经自带【只看楼主】功能，写完之后才发现早就有了。

现在把代码贴出来吧。虽然有很多解析 HTML 的开源类库（如 http://htmlagilitypack.codeplex.com/ ），但我一直习惯于用正则匹配。

截图：

呵呵，起码还能看吧@——#

  1 private void button1_Click(object sender, EventArgs e)
  2         {
  3             if (!string.IsNullOrEmpty(txtCsdnUrl.Text.Trim()))
  4             {
  5                 string url = txtCsdnUrl.Text.Trim();
  6                 string htmlSource = string.Empty;
  7                 htmlSource = GetHtmlSource(url);
  8                 int pageCount = GetPageCount(htmlSource);
  9                 string context = string.Empty;
 10 
 11                 if (pageCount > 1)
 12                 {
 13                     for (int i = 1; i <= pageCount; i++)
 14                     {
 15                         htmlSource = GetHtmlSource(url + "?page=" + i);
 16 
 17                        context+= GetLZArticle(htmlSource);
 18                     }
 19                 }
 20                 else
 21                 {
 22                     context += GetLZArticle(htmlSource);
 23                 }
 24 
 25                 richTextBox1.Text = context;
 26 
 27             }
 28             else
 29             {
 30                 MessageBox.Show("请输入地址");
 31             }
 32         }
 33 
 34         /// <summary>
 35         /// 获取源代码
 36         /// </summary>
 37         /// <param name="Url"></param>
 38         /// <returns></returns>
 39         public string GetHtmlSource(string Url)
 40         {
 41             WebClient client = new WebClient();
 42             client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
 43             Stream data = client.OpenRead(Url);
 44             string result = string.Empty;
 45             using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
 46             {
 47                 result = reader.ReadToEnd();
 48             }
 49 
 50             return result;
 51         }
 52 
 53         /// <summary>
 54         /// 获取贴子总页数 URL格式:http://bbs.csdn.net/topics/390730011?page=2
 55         /// </summary>
 56         /// <returns>返回最大页数</returns>
 57         public int GetPageCount(string HtmlSource)
 58         {
 59             int pageCount = 0;
 60 
 61             Regex reg = new Regex("<select class=\"jumpMenu\" name=\"jumpMenu\">(?<val>.*?)</select>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
 62             string htmlSource = HtmlSource;
 63             Regex reg1 = new Regex("<option.*?>(?<val>.*?)</option>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
 64             int count = reg1.Matches(reg.Match(htmlSource).Groups["val"].Value).Count;
 65 
 66             int.TryParse(reg1.Matches(reg.Match(htmlSource).Groups["val"].Value)[count - 1].Groups["val"].Value,
 67                          out pageCount);
 68 
 69             return pageCount;
 70         }
 71 
 72         /// <summary>
 73         /// 获取文章标题
 74         /// </summary>
 75         /// <param name="HtmlSource">网页内容</param>
 76         /// <returns></returns>
 77         public string GetArticleTitle(string HtmlSource)
 78         {
 79             string title = string.Empty;
 80 
 81             Regex reg = new Regex("<span class=\"title text_overflow\">(?<title>.*?)</span>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
 82 
 83             title = reg.Match(HtmlSource).Groups["title"].Value;
 84 
 85             return title;
 86         }
 87 
 88 
 89         public string GetAuthorName(string HtmlSource)
 90         {
 91             string result = string.Empty;
 92 
 93             Regex regex = new Regex("<a class=\"p-author\" href=\"#\">(?<value>.*?)</a>");
 94 
 95             result = regex.Match(HtmlSource).Groups["value"].Value;
 96 
 97             return result;
 98         }
 99 
100         public string GetLZArticle(string HtmlSource)
101         {
102 
103             string result = string.Empty;
104             string authorName = GetAuthorName(HtmlSource);
105 
106             Regex regex = new Regex("<td valign=\"top\" class=\"post_info .*?\" data-username=\"" + authorName + "\".*?>.*?<div class=\"post_body\">(?<value>.*?)</div>.*?</td>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
107 
108             for (int i = 0; i < regex.Matches(HtmlSource).Count; i++)
109             {
110                 result += regex.Matches(HtmlSource)[i].Groups["value"].Value;
111                 result += "--------------------分隔线--------------------";
112             }
113             return result.Trim().Replace("<br />","\r\n");
114         }