PHP class function for screen scraping
I’ve updated my simple PHP function (the one to replace fopen()) for grabbing URLs using cURL. I’ve added some features and made it a class instead of a straight PHP function. One improvement is the ability to normalize URLs so you can use relative URLs. It also has more error checking and uses a standard user-agent by default.
The syntax is a little different from the previous version. To use it, create an instance of the class, then call the appropriate method:
-
$urlScoop = new UrlGrabber;
-
$rawhtml=$urlScoop->_get($urlScoop->_normalize("http://www.simmonsconsulting.com/"));
Fetching a relative url would look like this:
-
$urlScoop = new UrlGrabber;
-
$rawhtml=$urlScoop->_get($urlScoop->_normalize("../../Photos/"));
The class is included after the jump.
-
/**
 * Small cURL wrapper for fetching pages (GET or POST) and for turning
 * relative URLs into absolute ones based on the current request.
 *
 * After every request the raw response (plus any diagnostic note) is kept
 * in $content and the curl_getinfo() result in $info.
 */
class UrlGrabber
{
    /** User-agent string sent with every request. */
    const USER_AGENT = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";

    /** Request timeout in seconds. */
    const TIMEOUT_SECONDS = 60;

    /**
     * Body of the last response, with a diagnostic note appended when the
     * transfer failed or the HTTP status was not 200.
     * Declared explicitly: dynamic properties are deprecated as of PHP 8.2.
     *
     * @var string
     */
    public $content = "";

    /**
     * Result of curl_getinfo() for the last request ("" before any request).
     *
     * @var array|string
     */
    public $info = "";

    /**
     * Fetch a URL with an HTTP GET request.
     *
     * Redirects are NOT followed here (unlike _post()), so a 3xx response is
     * reported as an abnormal status.
     *
     * @param string $url Absolute URL to fetch.
     * @return string Response body; a note is appended on failure or non-200 status.
     */
    public function _get($url)
    {
        return $this->_request($url, array());
    }

    /**
     * Send an HTTP POST request, following redirects.
     *
     * @param string       $url  Absolute URL to post to.
     * @param array|string $vars POST fields, passed straight to CURLOPT_POSTFIELDS.
     * @return string Response body; a note is appended on failure or non-200 status.
     */
    public function _post($url, $vars)
    {
        return $this->_request($url, array(
            CURLOPT_HEADER         => 0,
            CURLOPT_POST           => 1,
            CURLOPT_POSTFIELDS     => $vars,
            CURLOPT_FOLLOWLOCATION => 1,
        ));
    }

    /**
     * Turn a URL into an absolute one.
     *
     * - "http://..." / "https://..." URLs are returned unchanged.
     * - URLs starting with "/" are prefixed with the current scheme and host.
     * - Anything else is treated as relative to the directory of the running
     *   script, derived from the path segments that $_SERVER['PHP_SELF'] and
     *   getcwd() have in common.
     *
     * @param string $url Absolute, host-relative or script-relative URL.
     * @return string Absolute URL.
     */
    public function _normalize($url)
    {
        // is the url absolute?
        if (preg_match("/^https?:\/\//", $url)) {
            return $url;
        }

        $urlRoot = $this->_serverRoot();

        // is the url locally absolute?
        if (preg_match("/^\//", $url)) {
            return $urlRoot . $url;
        }

        // the url must be relative
        $self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
        $scriptParts = explode('/', str_replace('\\', '/', $self));
        array_pop($scriptParts); // drop the script's own file name

        $cwdParts = explode('/', str_replace('\\', '/', (string) getcwd()));

        // Keep the segments common to both paths, but discard empty ones:
        // the leading "/" of each path yields an empty segment that would
        // otherwise produce "http://host//dir/".
        $dirParts = array_diff(array_intersect($scriptParts, $cwdParts), array(''));
        $dirPath = count($dirParts) > 0 ? implode('/', $dirParts) . '/' : '';

        return $urlRoot . '/' . $dirPath . $url;
    }

    /**
     * Build "scheme://host" for the current request from $_SERVER.
     *
     * The scheme is https only when $_SERVER['HTTPS'] is "on". The host is
     * HTTP_HOST when present and non-empty, otherwise SERVER_NAME; if neither
     * is set the host part is empty.
     *
     * @return string
     */
    private function _serverRoot()
    {
        // HTTPS is absent on plain-HTTP requests, so guard the lookup.
        $https = isset($_SERVER['HTTPS']) ? $_SERVER['HTTPS'] : '';
        $scheme = ($https === 'on') ? "https://" : "http://";

        if (isset($_SERVER['HTTP_HOST']) && strlen($_SERVER['HTTP_HOST']) > 0) {
            $host = $_SERVER['HTTP_HOST'];
        } else {
            $host = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : '';
        }

        return $scheme . $host;
    }

    /**
     * Run a cURL transfer with the shared defaults plus per-method options,
     * store the outcome in $content / $info and return the content.
     *
     * @param string $url          URL to request.
     * @param array  $extraOptions Additional CURLOPT_* => value pairs, applied in order.
     * @return string Response body, with a note appended when anything went wrong.
     */
    private function _request($url, $extraOptions)
    {
        $this->content = "";
        $this->info = "";

        $ch = curl_init();
        if ($ch === false) {
            $this->content = "Unable to initialise cURL for $url";
            return $this->content;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, self::TIMEOUT_SECONDS);
        curl_setopt($ch, CURLOPT_USERAGENT, self::USER_AGENT);
        foreach ($extraOptions as $option => $value) {
            curl_setopt($ch, $option, $value);
        }

        $body = curl_exec($ch);
        $this->info = curl_getinfo($ch);

        // Check for success, if anything weird happened append a note
        if ($body === false || (int) $this->info['http_code'] !== 200) {
            // A failed transfer has no body; otherwise separate body and note.
            $body = ($body === false) ? "" : $body . "\n\n";
            $body .= "HTTP status was abnormal for $url [" . $this->info['http_code'] . "]";

            $error = curl_error($ch);
            if ($error) {
                $body .= "\n" . $error;
            }
        }

        // curl_close() is a no-op since PHP 8.0; only call it where it still
        // releases the handle.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }

        $this->content = $body;

        return $this->content;
    }
}