应用curl扩展抓取网页
2018-07-20 来源:open-open
<?php
namespace Think;

header("Content-Type: text/html;charset=utf-8");

/**
 * Minimal cURL-based page scraper.
 *
 * The constructor downloads the page at the given URL immediately and keeps the
 * raw response body in $data; the other methods extract pieces from that body.
 */
class Mycurl
{
    /** @var mixed cURL handle returned by curl_init(), or false when it could not be created */
    public $ch = null;

    /** @var string|false|null response body returned by curl_exec(); false when the transfer failed */
    public $data = null;

    /**
     * Opens a cURL handle for $url and fetches the page right away.
     *
     * @param string $url address of the page to download
     */
    public function __construct($url)
    {
        $this->ch = curl_init($url);
        if ($this->ch === false) {
            // No handle: mark the download as failed so the extractors return empty strings.
            $this->data = false;
            return;
        }

        // Do not include the response headers in the returned body.
        curl_setopt($this->ch, CURLOPT_HEADER, false);
        // Make curl_exec() return the response as a string instead of printing it.
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);

        $this->data = curl_exec($this->ch);
    }

    /**
     * Releases the cURL handle, if one was created.
     */
    public function __destruct()
    {
        if ($this->ch) {
            curl_close($this->ch);
        }
    }

    /**
     * Regex-based extraction of the article body.
     *
     * @return string inner HTML of <div id="article_content" class="article_content">,
     *                or '' when the page was not downloaded or the element is absent
     */
    public function regmatch()
    {
        if (!is_string($this->data)) {
            return '';
        }

        // A lazy (.*?) with the "s" flag captures everything up to the first closing </div>.
        // (A bracket expression such as [^...] excludes single characters, not the "</div>" sequence.)
        $reg = '/<div\sid="article_content"\sclass="article_content">(.*?)<\/div>/si';
        if (preg_match($reg, $this->data, $out) !== 1) {
            return '';
        }

        return $out[1];
    }

    /**
     * String-based extraction: returns the part of the page that starts at the first
     * case-insensitive occurrence of $pos1 and ends just before the next occurrence of $pos2.
     *
     * The start marker itself is part of the returned string; the end marker is not.
     *
     * @param string $pos1 start marker
     * @param string $pos2 end marker
     * @return string the extracted fragment, or '' when the page was not downloaded
     *                or either marker is missing
     */
    public function result($pos1, $pos2)
    {
        if (!is_string($this->data)) {
            return '';
        }

        $begin = stripos($this->data, $pos1);
        if ($begin === false) {
            return '';
        }

        // Search for the end marker only behind the start marker; an earlier occurrence
        // would otherwise produce a negative length.
        $end = stripos($this->data, $pos2, $begin + strlen($pos1));
        if ($end === false) {
            return '';
        }

        return substr($this->data, $begin, $end - $begin);
    }

    /**
     * Builds the article record from the downloaded page.
     *
     * @return array record with the keys title, content, atime, author, sort and summary
     */
    public function exec()
    {
        $data = array();

        $titleOpen = '<title>';
        $title = $this->result($titleOpen, '-卢松松博客</title>');
        // result() keeps the start marker, so strip the leading "<title>".
        $data['title'] = (string) substr($title, strlen($titleOpen));

        $content = $this->result('<dd class="post-info">', '<center>');
        // Turn site-relative image paths into absolute URLs.
        $content = str_ireplace("/upload/", "http://lusongsong.com/upload/", $content);
        // Undo the double prefix the previous step creates on URLs that were already absolute.
        $content = str_ireplace("http://lusongsong.comhttp://lusongsong.com", "http://lusongsong.com", $content);
        // Undo the remaining side effect of the first replacement on "blog/upload/" paths.
        $content = str_ireplace("bloghttp://lusongsong.com", "blog", $content);
        $data['content'] = $content;

        $data['atime']  = time();
        $data['author'] = 'Internet';
        $data['sort']   = '精彩博文';

        // Summary: plain text, at most 180 bytes.
        $data['summary'] = $this->truncateUtf8(strip_tags($content), 180);

        return $data;
    }

    /**
     * Cuts $text to at most $maxBytes bytes without splitting a multi-byte character.
     *
     * Assumes the scraped page is UTF-8 (the charset this script itself declares) — TODO confirm.
     * Falls back to a plain byte cut when the mbstring extension is not loaded.
     *
     * @param string $text     text to shorten
     * @param int    $maxBytes maximum length in bytes
     * @return string shortened text
     */
    private function truncateUtf8($text, $maxBytes)
    {
        if (function_exists('mb_strcut')) {
            return mb_strcut($text, 0, $maxBytes, 'UTF-8');
        }

        return (string) substr($text, 0, $maxBytes);
    }
}

// Test run: crawl a range of numbered article pages and store each one.
$url   = 'http://lusongsong.com/reed/';
$num   = 100; // number of articles to fetch
$start = 350; // first article number
$Art   = M('article');

for ($i = $start; $i < $start + $num; $i++) {
    $posurl = $url . $i . '.html';

    $curl = new Mycurl($posurl);
    $data = $curl->exec();
    $curl = null; // release the cURL handle right away, also for pages that get skipped
    $data['oldlink'] = $posurl;

    // Nothing could be extracted (failed download or unexpected markup): do not store an empty row.
    if ($data['title'] === '' && $data['content'] === '') {
        continue;
    }

    // strpos() returns 0 for a match at the very start of the title, so compare against false.
    if (strpos($data['title'], "出现404错误页面了") !== false) {
        continue;
    }

    $Art->add($data);
}

// NOTE(review): $this only exists here if this file is included from inside an object method
// (presumably a ThinkPHP controller action) — confirm; run standalone, this line raises an Error.
$this->success("执行完成!", "index");
// The closing tag is kept on purpose: non-PHP text follows in this file.
?>
标签:
版权声明:本站文章部分来自网络,如有侵权,请联系:west999com@outlook.com
特别注意:本站所有转载文章言论不代表本站观点!
本站所提供的图片等素材,版权归原作者所有,如需使用,请与原作者联系。
上一篇:C++ 计算n天后的日期
下一篇:关键路径算法C++实现代码
最新资讯
热门推荐