爬了一个宠物网站:
使用的语言是 PHP,
框架是 ThinkPHP 5(TP5)。
代码如下:
<?php

namespace app\api\controller;

use fast\Http; // HTTP helper used to fetch page source
use app\common\controller\Api;

/**
 * Demo API controller that crawls pet pictures from www.boqii.com.
 *
 * For every pet category it finds the highest listing page number, walks all
 * listing pages, extracts the (breed name => image URL) pairs and stores each
 * image under "<cwd>/cw/<category name>/<breed name>.<ext>".
 *
 * NOTE(review): every helper below is a public method on a controller whose
 * $noNeedLogin is ['*']. If the framework exposes public controller methods
 * as routable actions, download() and friends may be reachable without
 * authentication - confirm, and consider making the helpers protected.
 */
class Demo extends Api
{
    // If $noNeedLogin is empty, every endpoint requires login.
    // If $noNeedRight is empty, every endpoint requires a permission check.
    // An endpoint that needs no login needs no permission check either.

    // Endpoints that need no login; '*' means all of them.
    protected $noNeedLogin = ['*'];

    // Endpoints that need no permission check; '*' means all of them.
    protected $noNeedRight = ['*'];

    /**
     * Crawl every pet category and download all listed images.
     *
     * Progress is echoed directly to the output. The method pauses four
     * seconds after each category before moving on to the next one.
     *
     * @ApiTitle    (Crawl pet images)
     * @ApiSummary  (Downloads the pet pictures of every category into the cw/ directory)
     */
    public function test()
    {
        $this->domain = 'http://www.boqii.com';

        // URL slug => directory name of each category.
        $type = [
            'dog'      => '狗狗',
            'cat'      => '猫咪',
            'smallpet' => '小宠',
            'aquarium' => '水族',
            'reptile'  => '爬虫',
        ];
        $now_path = getcwd();

        foreach ($type as $enname => $zhname) {
            $url = $this->domain . '/pet-all/' . $enname . '/';
            echo $zhname;

            $max_page = intval($this->pd_max_page($url));

            // Target directory of the current category.
            $cwpath = $now_path . '/cw/' . $zhname;
            $this->pd_dir($cwpath);

            for ($i = 1; $i <= $max_page; $i++) {
                // Fetch the listing page, extract its images, download them.
                $yuanma = $this->get($url . '?p=' . $i);
                $img_arr = $this->re_img($yuanma);
                if (!is_array($img_arr)) {
                    continue;
                }
                foreach ($img_arr as $name => $vurl) {
                    $this->download($vurl, $cwpath, $name . ".png", $name);
                }
            }

            sleep(4);
            echo "<br>{$zhname}结束了";
        }
    }

    /**
     * Find the highest page number linked from a listing page.
     *
     * Scans the page source for "?p=<digits>" and returns the largest number
     * found, or 1 when the page contains no such link.
     *
     * @param string $url Listing page URL.
     * @return int Highest page number (at least 1).
     */
    public function pd_max_page($url)
    {
        $yuanma = $this->get($url);

        $max = 1;
        // Matches pagination links such as pet-all/dog/?p=2. The digit count
        // is unbounded so page numbers of 100 and above are not truncated.
        if (preg_match_all('/\?p\=([0-9]+)/', $yuanma, $end)) {
            foreach ($end[1] as $value) {
                $max = max($max, intval($value));
            }
        }

        return $max;
    }

    /**
     * Extract image URLs, keyed by their alt text, from listing page source.
     *
     * Matches markup of the form
     *   <a target="_blank" href="..."><img alt="NAME" src="URL" /></a>
     * When the same alt text occurs more than once, the last URL wins.
     *
     * @param string $str HTML source.
     * @return array Map of alt text => image URL; empty when nothing matches.
     */
    public function re_img($str)
    {
        $re = '/\<a .*?href="(.*?)"\>\<img alt="(.*?)" src="(.*?)" \/\>\<\/a\>/';

        $all = [];
        if (preg_match_all($re, $str, $end)) {
            foreach ($end[2] as $key => $zhname) {
                $all[$zhname] = $end[3][$key];
            }
        }

        return $all;
    }

    /**
     * Make sure a directory exists, creating it (recursively) when missing.
     *
     * Echoes the outcome and terminates the script when creation fails.
     *
     * @param string $path Directory path (UTF-8).
     */
    public function pd_dir($path)
    {
        // Check and create the same on-disk path; checking the UTF-8 path but
        // creating a converted one would leave the two out of sync.
        $fsPath = $this->toFilesystemPath($path);
        if (is_dir($fsPath)) {
            return;
        }

        // Third argument "true" creates nested directories. The second
        // is_dir() covers the directory appearing between check and mkdir.
        $res = mkdir($fsPath, 0777, true) || is_dir($fsPath);
        if ($res) {
            echo "目录 $path 创建成功<br>";
        } else {
            echo "目录 $path 创建失败";
            exit();
        }
    }

    /**
     * Fetch the source of a URL through fast\Http.
     *
     * @param string $url URL to request.
     * @return mixed Whatever Http::get() returns for the URL.
     */
    public function get($url)
    {
        $Http = new Http();

        return $Http->get($url);
    }

    /**
     * Download one image and store it as "<save_dir>/<name>.<extension>".
     *
     * The extension is taken from the URL path, falling back to the one in
     * $filename, and must be a known image type. The base name is $name,
     * falling back to the base name of $filename and then to the current
     * timestamp. An existing file of the same name is overwritten.
     *
     * @param string $url      Image URL.
     * @param string $save_dir Target directory; './cw/' when blank.
     * @param string $filename Optional file name used as fallback for the
     *                         base name and extension.
     * @param string $name     Base name (without extension) of the saved file.
     * @return array ['file_name' => string, 'save_path' => string, 'error' => int]
     *               error: 0 success, 3 unsupported extension, 4 download
     *               failed, 5 directory could not be created, 6 write failed.
     */
    public function download($url, $save_dir = './cw/', $filename = '', $name = '')
    {
        $failure = ['file_name' => '', 'save_path' => '', 'error' => 0];

        if (trim($save_dir) == '') {
            $save_dir = './cw/';
        }
        // Exactly one trailing slash, whether or not the caller supplied one.
        $save_dir = rtrim($save_dir, '/\\') . '/';

        // The extension comes from the URL path only, so a query string
        // cannot leak into the file name.
        $allowExt = ['gif', 'jpg', 'jpeg', 'png', 'bmp'];
        $ext = pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION);
        if (!in_array(strtolower($ext), $allowExt, true)) {
            $ext = pathinfo($filename, PATHINFO_EXTENSION);
        }
        // URL and name originate from scraped HTML: never store anything
        // but a known image type.
        if (!in_array(strtolower($ext), $allowExt, true)) {
            $failure['error'] = 3;
            return $failure;
        }

        $base = trim($name) !== '' ? $name : pathinfo($filename, PATHINFO_FILENAME);
        // Neutralise path separators and characters that are illegal in file
        // names so the name cannot escape $save_dir.
        $base = trim(preg_replace('/[\\\\\/:*?"<>|\x00-\x1f]/', '_', $base), ". \t");
        if ($base === '') {
            $base = (string) time();
        }
        $file_name = $base . '.' . $ext;
        $save_path = $save_dir . $file_name;

        // Create the target directory when needed.
        $fsDir = $this->toFilesystemPath($save_dir);
        if (!is_dir($fsDir) && !mkdir($fsDir, 0777, true) && !is_dir($fsDir)) {
            $failure['error'] = 5;
            return $failure;
        }

        $ch = curl_init();
        // NOTE(review): certificate verification is disabled, so any
        // certificate is trusted - acceptable only for non-sensitive content.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        $file = curl_exec($ch);
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        // Do not store an error page or an empty body as an image.
        if ($file === false || $file === '' || $status >= 400) {
            $failure['error'] = 4;
            return $failure;
        }

        // Overwrite instead of append: appending corrupts the image when the
        // crawler is run more than once.
        if (file_put_contents($this->toFilesystemPath($save_path), $file) === false) {
            $failure['error'] = 6;
            return $failure;
        }

        return ['file_name' => $file_name, 'save_path' => $save_path, 'error' => 0];
    }

    /**
     * Convert a UTF-8 path into the form the filesystem functions expect.
     *
     * Only PHP older than 7.1 on Windows gets a converted path; everywhere
     * else the path is returned unchanged.
     * NOTE(review): the conversion assumes a GBK system code page, as the
     * original code did - confirm for the target machine.
     *
     * @param string $path UTF-8 path.
     * @return string Path to hand to is_dir()/mkdir()/file_put_contents().
     */
    private function toFilesystemPath($path)
    {
        if (DIRECTORY_SEPARATOR === '\\' && PHP_VERSION_ID < 70100) {
            $converted = iconv("UTF-8", "GBK", $path);
            if ($converted !== false) {
                return $converted;
            }
        }

        return $path;
    }
}
每个功能块封装成一个方法,这样后期方便调用和修改。
有什么不懂的可以咨询!