php单线程爬虫类
- 代码:
/**
 * Single-threaded, depth-limited web crawler.
 *
 * The crawler itself performs no I/O: fetching a page and processing its
 * content are delegated to two user-supplied callbacks.
 *
 * Properties:
 *  - callcontent: callback that receives a URL and returns the page content
 *                 as a string (anything that is not a string is treated as a
 *                 failed fetch).
 *  - calltodo:    callback that receives the page content and performs the
 *                 business logic, e.g. storing the extracted data in a database.
 *
 * Methods:
 *  - run($depth = 2): crawl the start URL and follow links up to $depth levels.
 *
 * @author Lee
 */
class crawl
{
    /** @var callable Fetch callback: function ($url) returning the page content. */
    public $callcontent = 'getcontent';

    /** @var callable Business-logic callback: function ($content). */
    public $calltodo = 'todo';

    /** @var string Start URL given to the constructor. */
    private $url;

    /** @var array Map of URL => largest remaining depth it has been expanded with. */
    private $explored = array();

    /** @var array Map of URL => set (as array keys) of absolute links found on that page. */
    private $links = array();

    /**
     * @param string $url Absolute URL at which crawling starts.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Runs the crawler.
     *
     * Every distinct page is fetched and handed to the business callback at
     * most once per run, even when it is linked from several places.
     *
     * @param int $depth Number of levels to crawl; 1 processes only the start
     *                   page, 0 or less does nothing. Defaults to 2.
     * @return void
     */
    public function run($depth = 2)
    {
        // Start from a clean state so that run() can be called repeatedly.
        $this->explored = array();
        $this->links = array();

        $this->crawlpage($this->url, $depth);
    }

    /**
     * Processes one page and recurses into its links.
     *
     * Recursion stays inside this instance (instead of creating a new crawler
     * per link) so that customised callbacks apply to every page.
     *
     * @param string $url   Absolute URL of the page.
     * @param int    $depth Remaining depth, including this page.
     * @return void
     */
    private function crawlpage($url, $depth)
    {
        if ($depth <= 0) {
            return;
        }

        // Skip pages already expanded at least this deep; a page first reached
        // near the depth limit is expanded again if found closer to the start.
        if (isset($this->explored[$url]) && $this->explored[$url] >= $depth) {
            return;
        }
        $this->explored[$url] = $depth;

        if (!isset($this->links[$url])) {
            // Register the page before fetching so a failed fetch is not retried.
            $this->links[$url] = array();

            $content = $this->getcontent($url);
            if (!is_string($content)) {
                return;
            }

            $this->todo($content);

            foreach ($this->geturl($content) as $href) {
                $absolute = $this->resolveurl($url, $href);
                if ($absolute !== null) {
                    $this->links[$url][$absolute] = true;
                }
            }
        }

        foreach (array_keys($this->links[$url]) as $next) {
            $this->crawlpage($next, $depth - 1);
        }
    }

    /**
     * Calls the fetch callback.
     *
     * @param string $url URL passed to the callback.
     * @return mixed Whatever the callback returns (page content on success).
     */
    private function getcontent($url)
    {
        return call_user_func($this->callcontent, $url);
    }

    /**
     * Calls the business-logic callback.
     *
     * @param string $content Page content passed to the callback.
     * @return void
     */
    private function todo($content)
    {
        call_user_func($this->calltodo, $content);
    }

    /**
     * Extracts the href values of all anchor tags in a page.
     *
     * @param string $content Page content (HTML).
     * @return array Unique href values in document order, HTML entities decoded.
     */
    private function geturl($content)
    {
        // "<a" must be a whole tag name (\b) so <abbr>, <area>, ... are ignored;
        // the href value may be double-quoted, single-quoted or unquoted.
        $pattern = '/<a\b[^>]*?\shref\s*=\s*["\']?([^>"\'\s]*)/i';
        if (!preg_match_all($pattern, $content, $matches)) {
            return array();
        }

        $urls = array();
        foreach ($matches[1] as $href) {
            // Attribute values are entity-encoded in HTML ("&amp;" means "&").
            $urls[] = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
        }

        return array_values(array_unique($urls));
    }

    /**
     * Resolves a link found on a page against that page's URL.
     *
     * @param string $base Absolute URL of the page containing the link.
     * @param string $link Raw href value.
     * @return string|null Absolute http(s) URL, or null when the link is not
     *                     crawlable (empty, fragment-only, javascript:, mailto:, ...)
     *                     or the base URL has no host.
     */
    private function resolveurl($base, $link)
    {
        $link = trim($link);

        // The fragment never reaches the server; drop it.
        $hash = strpos($link, '#');
        if ($hash !== false) {
            $link = (string) substr($link, 0, $hash);
        }
        if ($link === '') {
            return null;
        }

        // A link that carries its own scheme is already absolute.
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $m)) {
            $scheme = strtolower($m[1]);
            return ($scheme === 'http' || $scheme === 'https') ? $link : null;
        }

        $parts = parse_url($base);
        if ($parts === false || empty($parts['host'])) {
            return null;
        }
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        // Protocol-relative link: //host/path
        if (strpos($link, '//') === 0) {
            return $scheme . ':' . $link;
        }

        $origin = $scheme . '://';
        if (isset($parts['user'])) {
            $origin .= $parts['user'];
            if (isset($parts['pass'])) {
                $origin .= ':' . $parts['pass'];
            }
            $origin .= '@';
        }
        $origin .= $parts['host'];
        if (isset($parts['port'])) {
            $origin .= ':' . $parts['port'];
        }

        $basepath = isset($parts['path']) ? $parts['path'] : '/';

        // Query-only link: same path, new query string.
        if ($link[0] === '?') {
            return $origin . $basepath . $link;
        }

        // Keep the query string out of dot-segment normalisation.
        $query = '';
        $mark = strpos($link, '?');
        if ($mark !== false) {
            $query = substr($link, $mark);
            $link = substr($link, 0, $mark);
        }

        if ($link[0] === '/') {
            $path = $link;
        } else {
            // Relative to the directory of the base path, as browsers do.
            $path = substr($basepath, 0, strrpos($basepath, '/') + 1) . $link;
        }

        return $origin . $this->removedots($path) . $query;
    }

    /**
     * Collapses "." and ".." segments of an absolute path.
     *
     * @param string $path Path starting with "/".
     * @return string Normalised path.
     */
    private function removedots($path)
    {
        $segments = explode('/', $path);
        $last = end($segments);

        $out = array();
        foreach ($segments as $segment) {
            if ($segment === '.') {
                continue;
            }
            if ($segment === '..') {
                // Never pop the leading empty segment that stands for the root.
                if (count($out) > 1) {
                    array_pop($out);
                }
                continue;
            }
            $out[] = $segment;
        }

        // A trailing "." or ".." denotes a directory: keep the trailing slash.
        if ($last === '.' || $last === '..') {
            $out[] = '';
        }

        return implode('/', $out);
    }
}
// - Test (测试):
/**
 * Fetch callback used by the crawler: downloads the page at the given URL.
 *
 * @param string $url URL to download.
 * @return string|false Page content, or false when the download fails.
 */
function getcontent($url)
{
    return file_get_contents($url);
}

/**
 * Business-logic callback used by the crawler: dumps the unique href values
 * found in the page.
 *
 * @param string $content Page content.
 * @return void
 */
function todo($content)
{
    $preg = '/<[a|A].*?href=[\'\"]{0,1}([^>\'\"\ ]*).*?>/i';
    $urls = array();
    if (preg_match_all($preg, $content, $res)) {
        $urls = $res[1];
    }
    // array_unique keeps the original keys, hence the gaps in the dumped indexes.
    var_dump(array_unique($urls));
}

// The class is named "crawl"; instantiating "scrawl" fails with a fatal
// "class not found" error.
$crawler = new crawl('https://blog.51cto.com/12173069');
// Depth 1: process only the start page.
$crawler->run(1);
- 输出:
array(72) { [0]=> string(22) "https://blog.51cto.com/" [2]=> string(30) "https://blog.51cto.com/original" [3]=> string(34) "https://blog.51cto.com/cloumn/index" [4]=> string(28) "https://blog.51cto.com/expert" [5]=> string(35) "https://blog.51cto.com/blogger/index" [6]=> string(19) "javascript:void(0);" [7]=> string(20) "http://edu.51cto.com" [8]=> string(21) "https://blog.51cto.com" [9]=> string(21) "http://down.51cto.com" [10]=> string(21) "http://home.51cto.com" [11]=> string(20) "http://bbs.51cto.com" [12]=> string(18) "http://x.51cto.com" [13]=> string(0) "" [14]=> string(20) "http://wot.51cto.com" [15]=> string(20) "http://www.51cto.com" [16]=> string(89) "http://home.51cto.com/user/register?reback=http%253A%252F%252Fblog.51cto.com%252F12173069" [17]=> string(78) "https://blog.51cto.com/user/login?reback=http%3A%2F%2Fblog.51cto.com%2F12173069" [18]=> string(12) "javascript:;" [19]=> string(34) "https://blog.51cto.com/search/index" [23]=> string(40) "http://home.51cto.com/space?uid=12163069" [27]=> string(37) "https://blog.51cto.com/12173069?type=1" [28]=> string(37) "https://blog.51cto.com/12173069?type=2" [29]=> string(37) "https://blog.51cto.com/12173069?type=3" [30]=> string(30) "https://blog.51cto.com/12173069" [37]=> string(33) "https://blog.51cto.com/12173069?s=" [38]=> string(34) "https://blog.51cto.com/12173069?s=3" [39]=> string(34) "https://blog.51cto.com/12173069?s=4" [40]=> string(34) "https://blog.51cto.com/12173069?s=5" [41]=> string(34) "https://blog.51cto.com/12173069?s=6" [50]=> string(38) "https://blog.51cto.com/12173069/2126752" [55]=> string(38) "https://blog.51cto.com/12173069/2126693" [60]=> string(38) "https://blog.51cto.com/12173069/2126661" [65]=> string(38) "https://blog.51cto.com/12173069/2126657" [70]=> string(38) "https://blog.51cto.com/12173069/2126596" [75]=> string(38) "https://blog.51cto.com/12173069/2126591" [80]=> string(38) "https://blog.51cto.com/12173069/2126496" [85]=> string(38) "https://blog.51cto.com/12173069/2126420" 
[90]=> string(38) "https://blog.51cto.com/12173069/2126324" [95]=> string(38) "https://blog.51cto.com/12173069/2126210" [100]=> string(38) "https://blog.51cto.com/12173069/2126090" [105]=> string(38) "https://blog.51cto.com/12173069/2125724" [110]=> string(38) "https://blog.51cto.com/12173069/2125666" [115]=> string(38) "https://blog.51cto.com/12173069/2125424" [120]=> string(38) "https://blog.51cto.com/12173069/2125359" [125]=> string(38) "https://blog.51cto.com/12173069/2124937" [130]=> string(38) "https://blog.51cto.com/12173069/2124923" [135]=> string(38) "https://blog.51cto.com/12173069/2124720" [140]=> string(38) "https://blog.51cto.com/12173069/2124693" [145]=> string(38) "https://blog.51cto.com/12173069/2124499" [147]=> string(33) "https://blog.51cto.com/12173069/p1" [148]=> string(33) "https://blog.51cto.com/12173069/p2" [149]=> string(33) "https://blog.51cto.com/12173069/p3" [150]=> string(33) "https://blog.51cto.com/12173069/p4" [151]=> string(33) "https://blog.51cto.com/12173069/p5" [152]=> string(33) "https://blog.51cto.com/12173069/p6" [153]=> string(33) "https://blog.51cto.com/12173069/p7" [154]=> string(33) "https://blog.51cto.com/12173069/p8" [156]=> string(34) "https://blog.51cto.com/12173069/p19" [159]=> string(39) "https://blog.51cto.com/ityouknow/2124403" [160]=> string(35) "https://blog.51cto.com/wyait/2125708" [161]=> string(39) "https://blog.51cto.com/lumay0526/2124116" [162]=> string(38) "https://blog.51cto.com/11010461/2123639" [163]=> string(35) "https://blog.51cto.com/qiuyt/2124456" [164]=> string(30) "https://blog.51cto.com/13716231" [166]=> string(30) "https://blog.51cto.com/13108471" [168]=> string(30) "https://blog.51cto.com/10316297" [170]=> string(30) "https://blog.51cto.com/13718637" [172]=> string(30) "https://blog.51cto.com/13681316" [174]=> string(20) "http://www.51CTO.com" [175]=> string(37) "https://blog.51cto.com/blogger/publish" [176]=> string(71) "http://wpa.qq.com/msgrd?v=3&uin=3591348659&site=qq&menu=yes" [177]=> 
string(39) "https://blog.51cto.com/51ctoblog/2057444" }
网页标题:php单线程爬虫类
URL网址:http://ybzwz.com/article/psoche.html