Multi 多线程插件


Multi扩展,可以实现多线程采集。

安装:

  1. composer require jaeger/querylist-ext-multi

Git地址:

  1. https://github.com/jae-jae/QueryList-Ext-Multi.git

依赖(通过Composer安装的请忽略):

Multi扩展依赖CurlMulti类,Git地址为:https://github.com/jae-jae/CurlMulti

用法一

  1. <?php
  2. /**
  3.  * Collects article data from several pages concurrently with the Multi extension.
  4.  */
  5. // Load the Composer autoloader so that QL\QueryList can be resolved.
  6. require 'QueryList/vendor/autoload.php';
  7. use QL\QueryList;
  8. // Run the Multi extension.
  9. QueryList::run('Multi', [
  10.     // URLs to collect
  11.     'list' => [
  12.         'http://cms.querylist.cc/news/it/547.html',
  13.         'http://cms.querylist.cc/news/it/545.html',
  14.         'http://cms.querylist.cc/news/it/543.html',
  15.         // more URLs ...
  16.     ],
  17.     'curl' => [
  18.         // cURL options; adjust these to your own needs
  19.         'opt' => [
  20.             CURLOPT_SSL_VERIFYPEER => false,
  21.             CURLOPT_SSL_VERIFYHOST => false,
  22.             CURLOPT_FOLLOWLOCATION => true,
  23.             CURLOPT_AUTOREFERER => true,
  24.             // ...
  25.         ],
  26.         // number of threads
  27.         'maxThread' => 100,
  28.         // maximum number of attempts
  29.         'maxTry' => 3,
  30.     ],
  31.     'success' => function ($a) {
  32.         // Extraction rules
  33.         $rules = [
  34.             // article title
  35.             'title' => ['h1', 'text'],
  36.             // article body: the filter drops hyperlinks but keeps their text, and removes the copyright block, scripts and similar noise
  37.             'content' => ['.post_content', 'html', 'a -.content_copyright -script'],
  38.         ];
  39.         $range = '.content';
  40.         $data = QueryList::Query($a['content'], $rules, $range)->getData();
  41.         // Print the result; real code would store it in a database here.
  42.         print_r($data);
  43.     },
  44. ]);

用法二

  1. <?php
  2. require 'QueryList/vendor/autoload.php';
  3. use QL\QueryList;
  4. // Configure the Multi extension without starting it, so more URLs can be queued first.
  5. $cm = QueryList::run('Multi', [
  6.     // URLs to collect
  7.     'list' => [
  8.         'http://cms.querylist.cc/news/it/547.html',
  9.         'http://cms.querylist.cc/news/it/545.html',
  10.         'http://cms.querylist.cc/news/it/543.html',
  11.         // more URLs ...
  12.     ],
  13.     'curl' => [
  14.         'opt' => [
  15.             CURLOPT_SSL_VERIFYPEER => false,
  16.             CURLOPT_SSL_VERIFYHOST => false,
  17.             CURLOPT_FOLLOWLOCATION => true,
  18.             CURLOPT_AUTOREFERER => true,
  19.         ],
  20.         // number of threads
  21.         'maxThread' => 100,
  22.         // maximum number of attempts
  23.         'maxTry' => 3,
  24.     ],
  25.     // Do not start automatically (the default is to start automatically).
  26.     'start' => false,
  27.     // As in the other examples, the callback receives one array: the page HTML
  28.     // is read from $a['content'] and the request URL from $a['info']['url'].
  29.     'success' => function ($a) {
  30.         // collection logic ...
  31.     },
  32.     'error' => function () {
  33.         // error handling
  34.     },
  35. ]);
  36. // Queue some additional URLs.
  37. $cm->add(
  38.     [
  39.         'http://cms.querylist.cc/news/it/532.html',
  40.         'http://cms.querylist.cc/news/it/528.html',
  41.         'http://cms.querylist.cc/news/other/530.html',
  42.     ],
  43.     function ($a) {
  44.         // success
  45.         // optional: collection logic specific to these URLs ...
  46.     },
  47.     function () {
  48.         // error
  49.         // optional: error handling specific to these URLs
  50.     }
  51. );
  52. // Start collecting.
  53. $cm->start();

用法三

  1. <?php
  2. require 'QueryList/vendor/autoload.php';
  3. use QL\QueryList;
  4. $url = 'http://www.phpddt.com/category/php/1/';
  5. // Replaces the whole <head> element with an empty one before the HTML is parsed.
  6. // preg_replace() returns null on failure; in that case the HTML is passed through unchanged.
  7. $stripHead = function ($html) {
  8.     $cleaned = preg_replace('/<head.+?>.+<\/head>/is', '<head></head>', $html);
  9.     return $cleaned === null ? $html : $cleaned;
  10. };
  11. $curl = QueryList::getInstance('QL\Ext\Lib\CurlMulti');
  12. // 100 threads
  13. $curl->maxThread = 100;
  14. // Fetch the list page, extract every title/link pair and queue the unseen articles.
  15. $data = QueryList::run('Request', [
  16.     'http' => [
  17.         'target' => $url,
  18.         'referrer' => $url,
  19.         'user_agent' => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36',
  20.         'cookiePath' => './cookie.txt',
  21.     ],
  22.     'callback' => $stripHead,
  23. ])->setQuery([
  24.     'title' => ['h2 a', 'text'],
  25.     'link' => ['h2 a', 'href'],
  26. ])->getData(function ($item) use ($curl, $stripHead) {
  27.     // Only queue articles that are not in the database yet.
  28.     if (!StudyModel::exist($item['title'])) {
  29.         $curl->add(['url' => $item['link']], function ($a) use ($stripHead) {
  30.             $html = $stripHead($a['content']);
  31.             $rows = QueryList::Query($html, [
  32.                 'title' => ['.entry_title', 'text'],
  33.                 'content' => ['.post', 'html', '-#headline -script -h3.post_tags -.copyright -.wumii-hook a'],
  34.             ])->getData();
  35.             // Skip pages where the selectors matched nothing instead of reading a missing index.
  36.             if (empty($rows[0])) {
  37.                 return;
  38.             }
  39.             // Insert into the database.
  40.             StudyModel::insert($rows[0]['title'], $rows[0]['content'], $a['info']['url']);
  41.         });
  42.     }
  43. });
  44. // Run the queued article requests.
  45. $curl->start();