PHP 爬虫:在 ThinkPHP 框架中调用 Http 类,批量保存图片

爬了一个宠物网站:

使用 PHP 编写,

框架为 ThinkPHP 5(tp5)。

 

 

代码如下:

<?php

namespace app\api\controller;

use fast\Http;//引入这个类

use app\common\controller\Api;

/**
 * Demo API controller: crawls the boqii.com pet category listings and saves the listed images locally.
 */
class Demo extends Api
{

    // If $noNeedLogin is empty, every endpoint requires login before it can be called.
    // If $noNeedRight is empty, every endpoint requires a permission check before it can be called.
    // An endpoint that is exempt from login is therefore exempt from the permission check as well.
    //
    // Endpoints that do not require login; '*' means all of them.
    protected $noNeedLogin = ['*'];
    // Endpoints that do not require a permission check; '*' means all of them.
    protected $noNeedRight = ['*'];

    /**
     * Crawl every pet category on boqii.com and save the listed images locally.
     *
     * For each category: determine the highest page number, make sure the
     * directory "<cwd>/cw/<category name>" exists, then fetch each listing page,
     * extract the (name => image URL) pairs and download every image into that
     * directory. Progress is echoed to the output.
     *
     * NOTE(review): the @Api* annotations below look like leftovers from the
     * scaffold sample (id/name/token parameters are not read by this method);
     * they are kept unchanged because the API documentation generator parses them.
     *
     * @ApiTitle    (测试名称)
     * @ApiSummary  (测试描述信息)
     * @ApiMethod   (POST)
     * @ApiRoute    (/api/demo/test/id/{id}/name/{name})
     * @ApiHeaders  (name=token, type=string, required=true, description="请求的Token")
     * @ApiParams   (name="id", type="integer", required=true, description="会员ID")
     * @ApiParams   (name="name", type="string", required=true, description="用户名")
     * @ApiParams   (name="data", type="object", sample="{'user_id':'int','user_name':'string','profile':{'email':'string','age':'integer'}}", description="扩展数据")
     * @ApiReturnParams   (name="code", type="integer", required=true, sample="0")
     * @ApiReturnParams   (name="msg", type="string", required=true, sample="返回成功")
     * @ApiReturnParams   (name="data", type="object", sample="{'user_id':'int','user_name':'string','profile':{'email':'string','age':'integer'}}", description="扩展数据返回")
     * @ApiReturn   ({
         'code':'1',
         'msg':'返回成功'
        })
     */
    public function test()
    {
        $this->domain = 'http://www.boqii.com';
        // URL slug => display name (the display name is also the directory name)
        $type = ['dog'=>'狗狗','cat'=>'猫咪','smallpet'=>'小宠','aquarium'=>'水族','reptile'=>'爬虫'];

        // Images are stored below the current working directory.
        $now_path = getcwd();

        foreach ($type as $enname => $zhname) {
            $url = $this->domain.'/pet-all/'.$enname.'/';
            echo $zhname;

            // Normalise once instead of calling intval() on every loop iteration.
            $max_page = intval($this->pd_max_page($url));

            // Directory for the current pet category.
            $cwpath = $now_path.'/cw/'.$zhname;
            $this->pd_dir($cwpath);

            for ($i = 1; $i <= $max_page; $i++) {
                // Fetch the listing page, extract the images, download each one.
                $yuanma = $this->get($url.'?p='.$i);
                $img_arr = $this->re_img($yuanma);

                // re_img() may signal failure with a non-array value; skip the
                // page instead of iterating over it.
                if (!is_array($img_arr)) {
                    continue;
                }

                foreach ($img_arr as $name => $vurl) {
                    $this->download($vurl, $cwpath, $name.".png", $name);
                }
            }

            // Pause between categories to go easy on the remote site.
            sleep(4);
            echo "<br>{$zhname}结束了";
        }
    }

    /**
     * Find the highest page number linked from a listing page.
     *
     * Fetches the page source and scans it for pagination links of the form
     * "?p=N".
     *
     * @param string $url Listing page URL
     * @return int Largest N found in the page source, or 1 when there is none
     */
    public function pd_max_page($url)
    {
        $yuanma = (string) $this->get($url);

        // Match the whole number: a {1,2} quantifier would read "?p=123" as 12.
        if (!preg_match_all('/\?p=(\d+)/', $yuanma, $found)) {
            // No pagination links (or a PCRE error): treat as a single page.
            return 1;
        }

        // Always return an int, never the raw matched string.
        return max(1, max(array_map('intval', $found[1])));
    }






    /**
     * Extract image links from a listing page.
     *
     * Recognises anchors of the form
     *   <a target="_blank" href="..."><img alt="NAME" src="URL" /></a>
     * and maps each alt text to its image URL. When the same alt text occurs
     * more than once, the last occurrence wins.
     *
     * @param string $str HTML source of the page
     * @return array Map of alt text => image URL; empty when nothing matches
     *               or the pattern could not be evaluated
     */
    public function re_img($str)
    {
        $re ='/\<a .*?href="(.*?)"\>\<img alt="(.*?)" src="(.*?)" \/\>\<\/a\>/';

        // preg_match_all() returns 0 for "no match" and false on error; in both
        // cases hand back an empty array so callers can always iterate the result.
        if (!preg_match_all($re, (string) $str, $end)) {
            return [];
        }

        // Group 2 is the alt text, group 3 the image URL.
        return array_combine($end[2], $end[3]);
    }

    
    /**
     * Make sure a directory exists, creating it (including parents) if needed.
     *
     * Echoes a status message when a directory is created, and terminates the
     * script when creation fails.
     *
     * @param string $path Directory path
     * @return void
     */
    public function pd_dir($path){
        if (is_dir($path)) {
            return;
        }

        // Create exactly the path that was checked above. Converting the path
        // to GBK only for mkdir() made the existence check and the creation
        // refer to different names, and iconv() can return false.
        // Third argument true = create nested directories.
        if (mkdir($path, 0777, true)) {
            echo "目录 $path 创建成功<br>";
            return;
        }

        echo "目录 $path 创建失败";
        exit();
    }

    /**
     * Fetch the source of a page through the Http helper.
     *
     * @param string $url Address to request
     * @return mixed Whatever Http::get() returns for the URL
     */
    public function get($url){
        $client = new Http();

        return $client->get($url);
    }

    /**
     * Download a remote file (an image) into a local directory.
     *
     * The stored file name is chosen as follows:
     *  - $name given:      "<name>.<extension of the URL path>" (falling back to
     *                      the extension of $filename when the URL has none);
     *  - else $filename:   used as is;
     *  - neither given:    "<unix time>.<ext>", only for gif/jpg/jpeg/png/bmp.
     *
     * @param string $url      Remote file URL
     * @param string $save_dir Target directory, defaults to './cw/'
     * @param string $filename Explicit file name (see above)
     * @param string $name     Base name without extension (see above)
     * @return array ['file_name' => string, 'save_path' => string, 'error' => int]
     *               error codes: 0 ok, 3 extension not allowed, 5 directory
     *               could not be created, 6 download failed, 7 write failed
     */
    public function download($url, $save_dir = './cw/', $filename = '', $name = '')
    {
        if (trim($save_dir) == '') {
            $save_dir = './cw/';
        }
        // Exactly one trailing slash, whatever the caller passed.
        $save_dir = rtrim($save_dir, '/') . '/';

        // Take the extension from the URL *path* so a query string cannot leak into it.
        $ext = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);

        if (trim($filename) == '') {
            $allowExt = ['gif', 'jpg', 'jpeg', 'png', 'bmp'];
            // $ext carries no leading dot, so it can actually match the list.
            if (!in_array(strtolower($ext), $allowExt, true)) {
                return ['file_name' => '', 'save_path' => '', 'error' => 3];
            }
            $filename = time() . '.' . $ext;
        }

        if (trim($name) != '') {
            if ($ext === '') {
                $ext = pathinfo($filename, PATHINFO_EXTENSION);
            }
            $target = $name . ($ext === '' ? '' : '.' . $ext);
        } else {
            $target = $filename;
        }
        // The name may come from scraped HTML: never let it escape $save_dir.
        $target = str_replace(['/', '\\', "\0"], '_', $target);

        // Create the target directory if necessary.
        if (!is_dir($save_dir) && !mkdir($save_dir, 0777, true)) {
            return ['file_name' => '', 'save_path' => '', 'error' => 5];
        }

        $ch = curl_init();
        // NOTE(review): certificate verification is disabled (trusts any
        // certificate); kept as in the original, enable it for https sources.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        $file = curl_exec($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // Do not store failed transfers or HTTP error pages as images.
        if ($file === false || $file === '' || $status >= 400) {
            return ['file_name' => '', 'save_path' => '', 'error' => 6];
        }

        // Overwrite instead of append: appending corrupts the file on a re-run.
        $save_path = $save_dir . $target;
        if (file_put_contents($save_path, $file) === false) {
            return ['file_name' => '', 'save_path' => '', 'error' => 7];
        }

        return ['file_name' => $target, 'save_path' => $save_path, 'error' => 0];
    }

    
}

每个功能块封装成一个方法,这样后期方便调用和修改。

有什么不懂的可以咨询!

1679
主攻后端PHP
4660
二次开发Code
1679
长连接Swoole
4640
高并发网站Redis