
现在越来越多的站长公司注重网站的安全了,网站纷纷采用HTTPS加密协议通讯,让网站更加的安全的同时,让许多开发者头疼,采集网站有的数据采集不到了,现在给Phper一个现成的采集抓取类,下面是对应的代码及示例。(代码不是原创,拿来主义)
<?php
/**
* Created by PhpStorm.
* User: Yt
* Date: 2016/4/6 0006
*/
function ihttp_request($url, $post = '', $extra = array(), $timeout = 60) {
$urlset = parse_url($url);
if (empty($urlset['path'])) {
$urlset['path'] = '/';
}
if (!empty($urlset['query'])) {
$urlset['query'] = "?{$urlset['query']}";
}
if (empty($urlset['port'])) {
$urlset['port'] = $urlset['scheme'] == 'https' ? '443' : '80';
}
if (strexists($url, 'https://') && !extension_loaded('openssl')) {
if (!extension_loaded("openssl")) {
message('请开启您PHP环境的openssl');
}
}
if (function_exists('curl_init') && function_exists('curl_exec')) {
$ch = curl_init();
if (ver_compare(phpversion(), '5.6') >= 0) {
curl_setopt($ch, CURLOPT_SAFE_UPLOAD, false);
}
if (!empty($extra['ip'])) {
$extra['Host'] = $urlset['host'];
$urlset['host'] = $extra['ip'];
unset($extra['ip']);
}
curl_setopt($ch, CURLOPT_URL, $urlset['scheme'] . '://' . $urlset['host'] . ($urlset['port'] == '80' ? '' : ':' . $urlset['port']) . $urlset['path'] . $urlset['query']);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
@curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
@curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
if ($post) {
if (is_array($post)) {
$filepost = false;
foreach ($post as $name => $value) {
if ((is_string($value) && substr($value, 0, 1) == '@') || (class_exists('CURLFile') && $value instanceof CURLFile)) {
$filepost = true;
break;
}
}
if (!$filepost) {
$post = http_build_query($post);
}
}
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
}
if (!empty($GLOBALS['_W']['config']['setting']['proxy'])) {
$urls = parse_url($GLOBALS['_W']['config']['setting']['proxy']['host']);
if (!empty($urls['host'])) {
curl_setopt($ch, CURLOPT_PROXY, "{$urls['host']}:{$urls['port']}");
$proxytype = 'CURLPROXY_' . strtoupper($urls['scheme']);
if (!empty($urls['scheme']) && defined($proxytype)) {
curl_setopt($ch, CURLOPT_PROXYTYPE, constant($proxytype));
} else {
curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
curl_setopt($ch, CURLOPT_HTTPPROXYTUNNEL, 1);
}
if (!empty($GLOBALS['_W']['config']['setting']['proxy']['auth'])) {
curl_setopt($ch, CURLOPT_PROXYUSERPWD, $GLOBALS['_W']['config']['setting']['proxy']['auth']);
}
}
}
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($ch, CURLOPT_SSLVERSION, 1);
if (defined('CURL_SSLVERSION_TLSv1')) {
curl_setopt($ch, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1);
}
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1');
if (!empty($extra) && is_array($extra)) {
$headers = array();
foreach ($extra as $opt => $value) {
if (strexists($opt, 'CURLOPT_')) {
curl_setopt($ch, constant($opt), $value);
} elseif (is_numeric($opt)) {
curl_setopt($ch, $opt, $value);
} else {
$headers[] = "{$opt}: {$value}";
}
}
if (!empty($headers)) {
curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
}
}
$data = curl_exec($ch);
$status = curl_getinfo($ch);
$errno = curl_errno($ch);
$error = curl_error($ch);
curl_close($ch);
if ($errno || empty($data)) {
return error(1, $error);
} else {
return ihttp_response_parse($data);
}
}
$method = empty($post) ? 'GET' : 'POST';
$fdata = "{$method} {$urlset['path']}{$urlset['query']} HTTP/1.1\r\n";
$fdata .= "Host: {$urlset['host']}\r\n";
if (function_exists('gzdecode')) {
$fdata .= "Accept-Encoding: gzip, deflate\r\n";
}
$fdata .= "Connection: close\r\n";
if (!empty($extra) && is_array($extra)) {
foreach ($extra as $opt => $value) {
if (!strexists($opt, 'CURLOPT_')) {
$fdata .= "{$opt}: {$value}\r\n";
}
}
}
$body = '';
if ($post) {
if (is_array($post)) {
$body = http_build_query($post);
} else {
$body = urlencode($post);
}
$fdata .= 'Content-Length: ' . strlen($body) . "\r\n\r\n{$body}";
} else {
$fdata .= "\r\n";
}
if ($urlset['scheme'] == 'https') {
$fp = fsockopen('ssl://' . $urlset['host'], $urlset['port'], $errno, $error);
} else {
$fp = fsockopen($urlset['host'], $urlset['port'], $errno, $error);
}
stream_set_blocking($fp, true);
stream_set_timeout($fp, $timeout);
if (!$fp) {
return error(1, $error);
} else {
fwrite($fp, $fdata);
$content = '';
while (!feof($fp))
$content .= fgets($fp, 512);
fclose($fp);
return ihttp_response_parse($content, true);
}
}
function ihttp_response_parse($data, $chunked = false) {
$rlt = array();
$headermeta = explode('HTTP/', $data);
if (count($headermeta) > 2) {
$data = 'HTTP/' . array_pop($headermeta);
}
$pos = strpos($data, "\r\n\r\n");
$split1[0] = substr($data, 0, $pos);
$split1[1] = substr($data, $pos + 4, strlen($data));
$split2 = explode("\r\n", $split1[0], 2);
preg_match('/^(\S+) (\S+) (\S+)$/', $split2[0], $matches);
$rlt['code'] = $matches[2];
$rlt['status'] = $matches[3];
$rlt['responseline'] = $split2[0];
$header = explode("\r\n", $split2[1]);
$isgzip = false;
$ischunk = false;
foreach ($header as $v) {
$pos = strpos($v, ':');
$key = substr($v, 0, $pos);
$value = trim(substr($v, $pos + 1));
if (is_array($rlt['headers'][$key])) {
$rlt['headers'][$key][] = $value;
} elseif (!empty($rlt['headers'][$key])) {
$temp = $rlt['headers'][$key];
unset($rlt['headers'][$key]);
$rlt['headers'][$key][] = $temp;
$rlt['headers'][$key][] = $value;
} else {
$rlt['headers'][$key] = $value;
}
if(!$isgzip && strtolower($key) == 'content-encoding' && strtolower($value) == 'gzip') {
$isgzip = true;
}
if(!$ischunk && strtolower($key) == 'transfer-encoding' && strtolower($value) == 'chunked') {
$ischunk = true;
}
}
if($chunked && $ischunk) {
$rlt['content'] = ihttp_response_parse_unchunk($split1[1]);
} else {
$rlt['content'] = $split1[1];
}
if($isgzip && function_exists('gzdecode')) {
$rlt['content'] = gzdecode($rlt['content']);
}
$rlt['meta'] = $data;
if($rlt['code'] == '100') {
return ihttp_response_parse($rlt['content']);
}
return $rlt;
}
function ihttp_response_parse_unchunk($str = null) {
if(!is_string($str) or strlen($str) < 1) {
return false;
}
$eol = "\r\n";
$add = strlen($eol);
$tmp = $str;
$str = '';
do {
$tmp = ltrim($tmp);
$pos = strpos($tmp, $eol);
if($pos === false) {
return false;
}
$len = hexdec(substr($tmp, 0, $pos));
if(!is_numeric($len) or $len < 0) {
return false;
}
$str .= substr($tmp, ($pos + $add), $len);
$tmp = substr($tmp, ($len + $pos + $add));
$check = trim($tmp);
} while(!empty($check));
unset($tmp);
return $str;
}
function ihttp_get($url) {
return ihttp_request($url);
}
function ihttp_post($url, $data) {
$headers = array('Content-Type' => 'application/x-www-form-urlencoded');
return ihttp_request($url, $data, $headers);
}
function strexists($string, $find) {
return !(strpos($string, $find) === FALSE);
}
function ver_compare($version1, $version2) {
$version1 = str_replace('.', '', $version1);
$version2 = str_replace('.', '', $version2);
$oldLength = istrlen($version1);
$newLength = istrlen($version2);
if ($oldLength > $newLength) {
$version2 .= str_repeat('0', $oldLength - $newLength);
}
if ($newLength > $oldLength) {
$version1 .= str_repeat('0', $newLength - $oldLength);
}
$version1 = intval($version1);
$version2 = intval($version2);
return version_compare($version1, $version2);
}
function istrlen($string, $charset = '') {
global $_W;
if (empty($charset)) {
$charset = $_W['charset'];
}
if (strtolower($charset) == 'gbk') {
$charset = 'gbk';
} else {
$charset = 'utf8';
}
if (function_exists('mb_strlen')) {
return mb_strlen($string, $charset);
} else {
$n = $noc = 0;
$strlen = strlen($string);
if ($charset == 'utf8') {
while ($n < $strlen) {
$t = ord($string[$n]);
if ($t == 9 || $t == 10 || (32 <= $t && $t <= 126)) {
$n++;
$noc++;
} elseif (194 <= $t && $t <= 223) {
$n += 2;
$noc++;
} elseif (224 <= $t && $t <= 239) {
$n += 3;
$noc++;
} elseif (240 <= $t && $t <= 247) {
$n += 4;
$noc++;
} elseif (248 <= $t && $t <= 251) {
$n += 5;
$noc++;
} elseif ($t == 252 || $t == 253) {
$n += 6;
$noc++;
} else {
$n++;
}
}
} else {
while ($n < $strlen) {
$t = ord($string[$n]);
if ($t > 127) {
$n += 2;
$noc++;
} else {
$n++;
$noc++;
}
}
}
return $noc;
}
}
//使用示例
$url = "http://search.jd.com";
$75271com = ihttp_request($url);
echo ($75271com['content']);
