• 欢迎访问开心洋葱网站,在线教程,推荐使用最新版火狐浏览器和Chrome浏览器访问本网站,欢迎加入开心洋葱 QQ群
  • 为方便开心洋葱网用户,开心洋葱官网已经开启复制功能!
  • 欢迎访问开心洋葱网站,手机也能访问哦~欢迎加入开心洋葱多维思维学习平台 QQ群
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏开心洋葱吧~~~~~~~~~~~~~!
  • 由于近期流量激增,小站的ECS没能经得起亲们的访问,本站依然没有盈利,如果各位看官觉着文字不错,还请给小站打个赏~~~~~~~~~~~~~!

PHP https协议下的网页抓取采集

PHP 开心洋葱 1364次浏览 0个评论

PHP https协议下的网页抓取采集
现在越来越多的站长和公司开始注重网站的安全,纷纷采用HTTPS加密协议通讯。这让网站更加安全的同时,也让许多开发者头疼:有些网站的数据采集不到了。现在给PHPer一个现成的采集抓取类,下面是对应的代码及示例。(代码不是原创,拿来主义)

<?php
/**
 * Created by PhpStorm.
 * User: Yt
 * Date: 2016/4/6 0006
*/

/**
 * Send an HTTP or HTTPS request and return the parsed response.
 *
 * cURL is used when it is available; otherwise the request is written by
 * hand over a socket opened with fsockopen().
 *
 * @param string       $url     Absolute URL including scheme and host.
 * @param string|array $post    Request body; any non-empty value makes the request a POST.
 * @param array        $extra   Extra options. A key containing "CURLOPT_" or a numeric key is
 *                              handed to curl_setopt(); the key "ip" (cURL path only) replaces
 *                              the host to connect to while the original host is sent as the
 *                              Host header; every other key is sent as a request header.
 * @param int          $timeout Connect and transfer timeout in seconds.
 * @return mixed The array built by ihttp_response_parse() on success, otherwise whatever
 *               error() returns (error() is defined outside this file).
 */
function ihttp_request($url, $post = '', $extra = array(), $timeout = 60) {
   $urlset = parse_url($url);
   // Without a scheme and a host no request can be built; fail early instead of
   // assembling a URL such as "://:80/".
   if (!is_array($urlset) || empty($urlset['scheme']) || empty($urlset['host'])) {
      return error(1, 'Invalid URL: ' . $url);
   }
   if (empty($urlset['path'])) {
      $urlset['path'] = '/';
   }
   // Always define the key so the concatenations below never read an unset index.
   if (isset($urlset['query']) && $urlset['query'] !== '') {
      $urlset['query'] = "?{$urlset['query']}";
   } else {
      $urlset['query'] = '';
   }
   if (empty($urlset['port'])) {
      $urlset['port'] = $urlset['scheme'] == 'https' ? '443' : '80';
   }
   if (strexists($url, 'https://') && !extension_loaded('openssl')) {
      message('请开启您PHP环境的openssl');
   }

   if (function_exists('curl_init') && function_exists('curl_exec')) {
      $ch = curl_init();
      // Disabling safe upload (to allow "@file" values) is only possible on PHP 5.6.x:
      // PHP 7 raises a warning for it and PHP 8 throws a ValueError.
      if (defined('CURLOPT_SAFE_UPLOAD')
         && version_compare(PHP_VERSION, '5.6.0', '>=')
         && version_compare(PHP_VERSION, '7.0.0', '<')) {
         curl_setopt($ch, CURLOPT_SAFE_UPLOAD, false);
      }
      if (!empty($extra['ip'])) {
         // Connect to the given IP but keep the original host name in the Host header.
         $extra['Host'] = $urlset['host'];
         $urlset['host'] = $extra['ip'];
         unset($extra['ip']);
      }
      $target = $urlset['scheme'] . '://' . $urlset['host']
         . ($urlset['port'] == '80' ? '' : ':' . $urlset['port'])
         . $urlset['path'] . $urlset['query'];
      curl_setopt($ch, CURLOPT_URL, $target);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
      @curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
      // Headers are included in the output so ihttp_response_parse() can read them.
      curl_setopt($ch, CURLOPT_HEADER, 1);
      @curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);

      if ($post) {
         if (is_array($post)) {
            // Arrays are URL-encoded unless one of the values is a file upload.
            $filepost = false;
            foreach ($post as $name => $value) {
               if ((is_string($value) && substr($value, 0, 1) == '@') || (class_exists('CURLFile') && $value instanceof CURLFile)) {
                  $filepost = true;
                  break;
               }
            }
            if (!$filepost) {
               $post = http_build_query($post);
            }
         }
         curl_setopt($ch, CURLOPT_POST, 1);
         curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
      }

      if (!empty($GLOBALS['_W']['config']['setting']['proxy'])) {
         $proxy = $GLOBALS['_W']['config']['setting']['proxy'];
         $urls = isset($proxy['host']) ? parse_url($proxy['host']) : array();
         if (!empty($urls['host'])) {
            $proxyaddr = $urls['host'];
            if (!empty($urls['port'])) {
               $proxyaddr .= ':' . $urls['port'];
            }
            curl_setopt($ch, CURLOPT_PROXY, $proxyaddr);
            $proxytype = empty($urls['scheme']) ? '' : 'CURLPROXY_' . strtoupper($urls['scheme']);
            if ($proxytype !== '' && defined($proxytype)) {
               curl_setopt($ch, CURLOPT_PROXYTYPE, constant($proxytype));
            } else {
               curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
               curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 1);
            }
            if (!empty($proxy['auth'])) {
               curl_setopt($ch, CURLOPT_PROXYUSERPWD, $proxy['auth']);
            }
         }
      }

      curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
      // NOTE(review): certificate and host-name verification are switched off, so HTTPS
      // responses are not authenticated. Kept as in the original; confirm this is intended.
      curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
      curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
      curl_setopt($ch, CURLOPT_SSLVERSION, defined('CURL_SSLVERSION_TLSv1') ? CURL_SSLVERSION_TLSv1 : 1);
      curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1');

      if (!empty($extra) && is_array($extra)) {
         $headers = array();
         foreach ($extra as $opt => $value) {
            if (strexists($opt, 'CURLOPT_')) {
               curl_setopt($ch, constant($opt), $value);
            } elseif (is_numeric($opt)) {
               curl_setopt($ch, $opt, $value);
            } else {
               $headers[] = "{$opt}: {$value}";
            }
         }
         if (!empty($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
         }
      }

      $data = curl_exec($ch);
      $errno = curl_errno($ch);
      $error = curl_error($ch);
      curl_close($ch);
      if ($errno || empty($data)) {
         return error(1, $error);
      }
      return ihttp_response_parse($data);
   }

   // Fallback without cURL: build the request by hand and send it over a socket.
   $method = empty($post) ? 'GET' : 'POST';
   $fdata = "{$method} {$urlset['path']}{$urlset['query']} HTTP/1.1\r\n";
   $fdata .= "Host: {$urlset['host']}\r\n";
   if (function_exists('gzdecode')) {
      $fdata .= "Accept-Encoding: gzip, deflate\r\n";
   }
   $fdata .= "Connection: close\r\n";
   if (!empty($extra) && is_array($extra)) {
      foreach ($extra as $opt => $value) {
         // cURL options (named or numeric) are not request headers.
         if (!strexists($opt, 'CURLOPT_') && !is_numeric($opt)) {
            $fdata .= "{$opt}: {$value}\r\n";
         }
      }
   }
   $body = '';
   if ($post) {
      if (is_array($post)) {
         $body = http_build_query($post);
      } else {
         // NOTE(review): the cURL path sends a string body unchanged while this path
         // URL-encodes it; kept as in the original, confirm which one is intended.
         $body = urlencode($post);
      }
      $fdata .= 'Content-Length: ' . strlen($body) . "\r\n\r\n{$body}";
   } else {
      $fdata .= "\r\n";
   }

   $remote = ($urlset['scheme'] == 'https' ? 'ssl://' : '') . $urlset['host'];
   $fp = fsockopen($remote, (int) $urlset['port'], $errno, $error, $timeout);
   // The handle must be checked before any stream_* call touches it.
   if (!$fp) {
      return error(1, $error);
   }
   stream_set_blocking($fp, true);
   stream_set_timeout($fp, $timeout);
   fwrite($fp, $fdata);
   $content = '';
   while (!feof($fp)) {
      $line = fgets($fp, 512);
      if ($line === false) {
         // Read failure or timeout: stop instead of spinning until EOF.
         break;
      }
      $content .= $line;
   }
   fclose($fp);
   return ihttp_response_parse($content, true);
}


/**
 * Split a raw HTTP response into status line, headers and body.
 *
 * When the input holds several header blocks in a row (as cURL returns after
 * following a redirect, or after an interim response such as "100 Continue"),
 * only the last block and its body are parsed.
 *
 * @param string $data    Raw response: header block(s), a blank line, then the body.
 * @param bool   $chunked When true, a body announced as "Transfer-Encoding: chunked"
 *                        is decoded with ihttp_response_parse_unchunk().
 * @return array Keys: 'code' (status code as string, '' if the status line is not
 *               recognised), 'status' (reason phrase), 'responseline', 'headers'
 *               (name => value, or name => list of values for repeated headers),
 *               'content' (body) and 'meta' (the raw text that was parsed).
 */
function ihttp_response_parse($data, $chunked = false) {
   $rlt = array();
   $data = (string) $data;

   // Drop leading header blocks, but only while what follows the blank line is itself
   // a status line. Searching the whole response for "HTTP/" would truncate any body
   // that merely mentions the protocol name.
   $pos = strpos($data, "\r\n\r\n");
   while ($pos !== false) {
      $rest = (string) substr($data, $pos + 4);
      if (!preg_match('#^HTTP/\d+(?:\.\d+)? \d{3}#', $rest)) {
         break;
      }
      $data = $rest;
      $pos = strpos($data, "\r\n\r\n");
   }

   if ($pos === false) {
      // No blank line: treat everything as headers with an empty body.
      $rawhead = $data;
      $rawbody = '';
   } else {
      $rawhead = (string) substr($data, 0, $pos);
      $rawbody = (string) substr($data, $pos + 4);
   }

   $lines = explode("\r\n", $rawhead);
   $statusline = array_shift($lines);

   // The reason phrase may contain spaces ("404 Not Found") or be empty ("HTTP/2 200 ").
   $rlt['code'] = '';
   $rlt['status'] = '';
   if (preg_match('/^(\S+)\s+(\d{3})(?:\s+(.*))?$/', $statusline, $matches)) {
      $rlt['code'] = $matches[2];
      $rlt['status'] = isset($matches[3]) ? trim($matches[3]) : '';
   }
   $rlt['responseline'] = $statusline;
   $rlt['headers'] = array();

   $isgzip = false;
   $ischunk = false;
   foreach ($lines as $line) {
      $colon = strpos($line, ':');
      if ($colon === false) {
         // Not a "Name: value" line; skip it.
         continue;
      }
      $key = substr($line, 0, $colon);
      $value = trim((string) substr($line, $colon + 1));
      if (!isset($rlt['headers'][$key])) {
         $rlt['headers'][$key] = $value;
      } elseif (is_array($rlt['headers'][$key])) {
         $rlt['headers'][$key][] = $value;
      } else {
         // Second occurrence of a header: turn the single value into a list.
         $rlt['headers'][$key] = array($rlt['headers'][$key], $value);
      }
      $lowerkey = strtolower($key);
      $lowervalue = strtolower($value);
      if (!$isgzip && $lowerkey == 'content-encoding' && $lowervalue == 'gzip') {
         $isgzip = true;
      }
      if (!$ischunk && $lowerkey == 'transfer-encoding' && $lowervalue == 'chunked') {
         $ischunk = true;
      }
   }

   if ($chunked && $ischunk) {
      $rlt['content'] = ihttp_response_parse_unchunk($rawbody);
   } else {
      $rlt['content'] = $rawbody;
   }
   // Only decompress a real, non-empty string (unchunking may have returned false).
   if ($isgzip && function_exists('gzdecode') && is_string($rlt['content']) && $rlt['content'] !== '') {
      $rlt['content'] = gzdecode($rlt['content']);
   }

   $rlt['meta'] = $data;
   if ($rlt['code'] == '100') {
      // Interim response: the real response is in the body; keep the caller's chunk flag.
      return ihttp_response_parse((string) $rlt['content'], $chunked);
   }
   return $rlt;
}

/**
 * Decode a body sent with "Transfer-Encoding: chunked".
 *
 * Each chunk is a hexadecimal size line (optionally followed by ";extension"),
 * CRLF, then that many bytes of data. A zero-size chunk ends the body; any
 * trailer lines after it are ignored.
 *
 * @param string|null $str Chunk-encoded body.
 * @return string|false The decoded body, or false when the input is not a
 *                      non-empty string or a chunk size line is missing or malformed.
 */
function ihttp_response_parse_unchunk($str = null) {
   if (!is_string($str) || strlen($str) < 1) {
      return false;
   }
   $eol = "\r\n";
   $add = strlen($eol);
   $rest = $str;
   $decoded = '';
   do {
      // Skip the CRLF that terminates the previous chunk's data.
      $rest = ltrim($rest);
      $pos = strpos($rest, $eol);
      if ($pos === false) {
         return false;
      }
      $sizefield = substr($rest, 0, $pos);
      // Chunk extensions follow a ';' and are not part of the size.
      $semicolon = strpos($sizefield, ';');
      if ($semicolon !== false) {
         $sizefield = substr($sizefield, 0, $semicolon);
      }
      $sizefield = trim((string) $sizefield);
      if (!preg_match('/^[0-9a-fA-F]+$/', $sizefield)) {
         return false;
      }
      $len = (int) hexdec($sizefield);
      if ($len === 0) {
         // Last chunk: whatever follows is trailer headers, not data.
         break;
      }
      $decoded .= substr($rest, $pos + $add, $len);
      $rest = (string) substr($rest, $len + $pos + $add);
      $check = trim($rest);
   } while (!empty($check));
   return $decoded;
}


/**
 * Request a URL without a body, using ihttp_request() with its default
 * options and timeout.
 *
 * @param string $url URL to fetch.
 * @return mixed Whatever ihttp_request() returns.
 */
function ihttp_get($url) {
   $response = ihttp_request($url);

   return $response;
}


/**
 * Send $data as the request body through ihttp_request(), declaring it as
 * a URL-encoded form.
 *
 * @param string       $url  URL to send the request to.
 * @param string|array $data Request body, passed on unchanged.
 * @return mixed Whatever ihttp_request() returns.
 */
function ihttp_post($url, $data) {
   return ihttp_request(
      $url,
      $data,
      array('Content-Type' => 'application/x-www-form-urlencoded')
   );
}



/**
 * Tell whether $find occurs anywhere in $string.
 *
 * @param string $string Text to search in.
 * @param string $find   Text to look for.
 * @return bool True when strpos() finds a match (including at offset 0).
 */
function strexists($string, $find) {
   $offset = strpos($string, $find);

   return false !== $offset;
}


/**
 * Compare two dotted version strings as left-aligned digit sequences.
 *
 * The dots are removed and the shorter digit string is right-padded with
 * zeros, so "0.52" sorts before "0.6" and "1.0" equals "1.0.0". Anything from
 * the first non-digit character on (for example a "-1ubuntu2" build suffix)
 * is ignored, so it can no longer distort the padding.
 *
 * @param string $version1 First version.
 * @param string $version2 Second version.
 * @return int -1 when $version1 is lower, 0 when equal, 1 when higher.
 */
function ver_compare($version1, $version2) {
   $digits1 = preg_match('/^\d+/', str_replace('.', '', (string) $version1), $found1) ? $found1[0] : '';
   $digits2 = preg_match('/^\d+/', str_replace('.', '', (string) $version2), $found2) ? $found2[0] : '';

   // Pad on the right so both strings have the same number of digits.
   $width = max(strlen($digits1), strlen($digits2));
   $digits1 = str_pad($digits1, $width, '0');
   $digits2 = str_pad($digits2, $width, '0');

   // For equal-length digit strings, byte order is numeric order; comparing strings
   // also avoids integer overflow on very long inputs.
   $order = strcmp($digits1, $digits2);
   if ($order == 0) {
      return 0;
   }
   return $order < 0 ? -1 : 1;
}


/**
 * Count the characters (not bytes) in a string.
 *
 * @param string $string  Text to measure.
 * @param string $charset 'gbk' (case-insensitive) selects GBK; any other value selects
 *                        UTF-8. When empty, the global $_W['charset'] is used if it is set.
 * @return int Number of characters. mb_strlen() is used when available, otherwise the
 *             bytes are walked by hand.
 */
function istrlen($string, $charset = '') {
   global $_W;
   // $_W may not be set at all (e.g. outside the host framework).
   if (empty($charset) && isset($_W['charset'])) {
      $charset = $_W['charset'];
   }
   $charset = strtolower((string) $charset) == 'gbk' ? 'gbk' : 'utf8';

   if (function_exists('mb_strlen')) {
      return mb_strlen($string, $charset);
   }

   $n = 0;
   $noc = 0;
   $strlen = strlen($string);

   if ($charset == 'utf8') {
      while ($n < $strlen) {
         $t = ord($string[$n]);
         if ($t < 128) {
            // Every ASCII byte, control characters included, is one character,
            // which matches what the mb_strlen() path reports.
            $n++;
            $noc++;
         } elseif (194 <= $t && $t <= 223) {
            $n += 2;
            $noc++;
         } elseif (224 <= $t && $t <= 239) {
            $n += 3;
            $noc++;
         } elseif (240 <= $t && $t <= 247) {
            $n += 4;
            $noc++;
         } elseif (248 <= $t && $t <= 251) {
            $n += 5;
            $noc++;
         } elseif ($t == 252 || $t == 253) {
            $n += 6;
            $noc++;
         } else {
            // Byte that cannot start a sequence: skipped without being counted.
            $n++;
         }
      }
   } else {
      while ($n < $strlen) {
         // In the GBK branch a byte above 127 starts a two-byte character.
         $n += ord($string[$n]) > 127 ? 2 : 1;
         $noc++;
      }
   }

   return $noc;
}

// Usage example: fetch a page and print its body.
// The variable is named $response because a PHP variable name cannot start with a digit.
$url = "http://search.jd.com";
$response = ihttp_request($url);
// Only print when the request produced a parsed response with a body.
if (is_array($response) && isset($response['content'])) {
   echo ($response['content']);
}


开心洋葱 , 版权所有丨如未注明 , 均为原创丨未经授权请勿修改 , 转载请注明PHP https协议下的网页抓取采集
喜欢 (1)

您必须 登录 才能发表评论!

加载中……