C#写爬虫,版本V2.1
2018-06-22 07:05:55来源:未知 阅读 ()
这次是对2.0的小修补:2.0几乎没有交互,这次添加了进度条和文本框。另外,请求所取得的链接时,最常出现的错误是获取webResponse时抛出的异常(WebException)。
针对这种情况,设置了
try { webResponse = (HttpWebResponse)webRequest.GetResponse(); } catch(WebException ex) { webResponse = (HttpWebResponse)ex.Response; }
捕获异常并取出其中携带的响应,这里我们不做额外处理,后续直接判断StatusCode属性来决定是否还要执行下面的程序。
另外一点变化就是以前是通过将所获取的网页存到文本中去,这次
WebRequest myRequest = WebRequest.Create("http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=" + Uri.EscapeDataString(keyWord)); HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse(); if (myResponse.StatusCode == HttpStatusCode.OK) { Stream strm = myResponse.GetResponseStream(); StreamReader sr = new StreamReader(strm); string line = sr.ReadToEnd();
将它全放入了string中。
最后一点是去掉了DownloadPage这个方法,如上,它的功能可以放入按钮的单击事件中实现,没有必要把一件事做两遍。
下面是前台页面:
后台代码:
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace 百度图片爬虫V2._1
{
    /// <summary>
    /// Main window of the Baidu image crawler. Searches Baidu Images for the keyword typed
    /// into <c>textBox1</c>, downloads every image link found in the result page on
    /// background threads, stores each image (re-encoded as JPEG) through the LINQ to SQL
    /// data context, and reports progress in <c>progressBar1</c> / <c>textBox2</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Signature of the background download worker (<see cref="savePicture"/>).</summary>
        /// <param name="s">Absolute URL of the image to download.</param>
        /// <param name="i">Zero-based index of the link within the current batch.</param>
        public delegate void AsynFunction(string s, int i);

        // Baidu image search URL; the URL-escaped keyword is appended to it.
        private const string SearchUrlPrefix = "http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=";

        // Built once instead of on every getLinks call.
        private static readonly Regex LinkRegex = new Regex(
            @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        // Number of downloads of the current batch that have finished (successfully or not).
        // Only read and written on the UI thread, so no locking is needed.
        private int completedDownloads;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Extracts every http link from <paramref name="html"/> that looks like an image
        /// (see <see cref="isValiable"/>).
        /// </summary>
        /// <param name="html">Page source to scan; null or empty yields no links.</param>
        /// <param name="counts">Receives the number of links returned.</param>
        /// <returns>
        /// The matching links in document order. The array holds exactly
        /// <paramref name="counts"/> entries (no trailing null padding).
        /// </returns>
        private static string[] getLinks(string html, out int counts)
        {
            List<string> links = new List<string>();

            if (!string.IsNullOrEmpty(html))
            {
                foreach (Match match in LinkRegex.Matches(html))
                {
                    if (isValiable(match.Value))
                    {
                        links.Add(match.Value);
                    }
                }
            }

            counts = links.Count;
            return links.ToArray();
        }

        /// <summary>
        /// Runs the search for the keyword in <c>textBox1</c> and starts one asynchronous
        /// download per image link found in the result page.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string keyWord = this.textBox1.Text;
            string html;

            try
            {
                WebRequest myRequest = WebRequest.Create(SearchUrlPrefix + Uri.EscapeDataString(keyWord));

                // Dispose the response and its stream so the underlying connection is released.
                using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
                {
                    if (myResponse.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    using (Stream strm = myResponse.GetResponseStream())
                    using (StreamReader sr = new StreamReader(strm))
                    {
                        html = sr.ReadToEnd();
                    }
                }
            }
            catch (WebException ex)
            {
                // A failed search must not take down the UI; show the reason instead.
                this.textBox2.AppendText(Environment.NewLine + ex.Message);
                return;
            }
            catch (IOException ex)
            {
                this.textBox2.AppendText(Environment.NewLine + ex.Message);
                return;
            }

            int counts;
            string[] links = getLinks(html, out counts);

            // Start a fresh batch: progress counts finished downloads from zero.
            this.completedDownloads = 0;
            this.progressBar1.Value = this.progressBar1.Minimum;
            this.progressBar1.Maximum = counts;

            for (int i = 0; i < counts; i++)
            {
                AsynFunction fun = new AsynFunction(savePicture);
                fun.BeginInvoke(links[i], i, ar =>
                {
                    AsynFunction worker = (AsynFunction)ar.AsyncState;
                    string failure = null;

                    try
                    {
                        worker.EndInvoke(ar);
                    }
                    catch (Exception ex)
                    {
                        // EndInvoke rethrows whatever savePicture threw (bad image data,
                        // malformed URL, database error). Letting it escape on a thread-pool
                        // thread would terminate the process, so report it instead.
                        failure = ex.Message;
                    }

                    postToUi(() =>
                    {
                        // Advance by one per finished download rather than jumping to Maximum.
                        this.completedDownloads++;
                        int value = Math.Min(this.completedDownloads, this.progressBar1.Maximum);
                        this.progressBar1.Value = Math.Max(this.progressBar1.Minimum, value);

                        StringBuilder sb = new StringBuilder();
                        if (failure != null)
                        {
                            sb.Append(Environment.NewLine);
                            sb.Append(failure);
                        }
                        sb.Append(Environment.NewLine);
                        sb.Append("下载结束");
                        this.textBox2.AppendText(sb.ToString());
                    });
                }, fun);
            }
        }

        /// <summary>
        /// Returns true when <paramref name="url"/> contains ".jpg", ".gif" or ".png"
        /// (case-sensitive), i.e. when it looks like an image resource.
        /// </summary>
        private static bool isValiable(string url)
        {
            return url.Contains(".jpg") || url.Contains(".gif") || url.Contains(".png");
        }

        /// <summary>
        /// Downloads the image at <paramref name="path"/>, re-encodes it as JPEG and inserts
        /// it into the <c>pictureUrl</c> table, then logs the URL to <c>textBox2</c>.
        /// Runs on a background thread; all UI access is marshalled to the UI thread.
        /// </summary>
        /// <param name="path">Absolute image URL. Null, empty or non-image URLs are ignored.</param>
        /// <param name="i">
        /// Index of the link within the batch. Kept for delegate compatibility; progress is
        /// now driven by the completion callback so it no longer depends on finish order.
        /// </param>
        public void savePicture(string path, int i)
        {
            // Nothing is stored for non-image URLs, so skip them before hitting the network.
            if (string.IsNullOrEmpty(path) || !isValiable(path))
            {
                return;
            }

            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(new Uri(path));
            webRequest.Referer = "http://image.baidu.com";
            webRequest.Timeout = 30000; // connection timeout in milliseconds
            webRequest.AllowAutoRedirect = true;
            webRequest.Headers.Set("Pragma", "no-cache");
            webRequest.UserAgent = "Mozilla-Firefox-Spider(Wenanry)";

            HttpWebResponse webResponse;
            try
            {
                webResponse = (HttpWebResponse)webRequest.GetResponse();
            }
            catch (WebException ex)
            {
                // Protocol errors still carry a response; it is null for connection-level
                // failures. Either way the status check below decides whether to go on.
                webResponse = (HttpWebResponse)ex.Response;
            }

            if (webResponse == null)
            {
                return;
            }

            using (webResponse)
            {
                if (webResponse.StatusCode != HttpStatusCode.OK)
                {
                    return;
                }

                byte[] jpegBytes;

                // The source stream must outlive the Bitmap, hence this nesting order:
                // the bitmap is disposed before the stream it was created from.
                using (Stream body = webResponse.GetResponseStream())
                using (Bitmap myImage = new Bitmap(body))
                using (MemoryStream ms = new MemoryStream())
                {
                    myImage.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                    jpegBytes = ms.ToArray();
                }

                // NOTE(review): assumes DataClasses1DataContext is the designer-generated
                // LINQ to SQL context (derives from the disposable DataContext) - confirm.
                using (DataClasses1DataContext db = new DataClasses1DataContext())
                {
                    var p = new pictureUrl { pictureUrl1 = jpegBytes };
                    db.pictureUrl.InsertOnSubmit(p);
                    db.SubmitChanges();
                }
            }

            postToUi(() =>
            {
                StringBuilder sb1 = new StringBuilder();
                sb1.Append(path);
                sb1.Append("图片下载开始" + Environment.NewLine);
                this.textBox2.AppendText(sb1.ToString());
            });
        }

        /// <summary>Closes the window.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            this.Close();
        }

        /// <summary>
        /// Queues <paramref name="action"/> on the UI thread. Silently does nothing when the
        /// form has already been closed, so late-finishing downloads cannot crash the app.
        /// </summary>
        private void postToUi(Action action)
        {
            if (this.IsDisposed || !this.IsHandleCreated)
            {
                return;
            }

            try
            {
                this.BeginInvoke(action);
            }
            catch (InvalidOperationException)
            {
                // The form was closed between the check above and the call; there is no UI
                // left to update. (ObjectDisposedException derives from this type.)
            }
        }
    }
}
标签:
版权申明:本站文章部分自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点,本站所提供的摄影照片,插画,设计作品,如需使用,请与原作者联系,版权归原作者所有
- 浅谈ASP.Net Core WebApi几种版本控制对比 2019-12-10
- 【转载】如何查看sqlserver客户端的版本号信息 2019-07-23
- 【转载】ASP.NET以Post方式抓取远程网页内容类似爬虫功能 2019-03-13
- SignalR 2 入门 2018-12-17
- Asp.Net 学习笔记(IIS不同版本和Asp.Net) 2018-10-11
IDC资讯: 主机资讯 注册资讯 托管资讯 vps资讯 网站建设
网站运营: 建站经验 策划盈利 搜索优化 网站推广 免费资源
网络编程: Asp.Net编程 Asp编程 Php编程 Xml编程 Access Mssql Mysql 其它
服务器技术: Web服务器 Ftp服务器 Mail服务器 Dns服务器 安全防护
软件技巧: 其它软件 Word Excel Powerpoint Ghost Vista QQ空间 QQ FlashGet 迅雷
网页制作: FrontPages Dreamweaver Javascript css photoshop fireworks Flash