Simmons Consulting, the Website of Toby Simmons

PHP class function for screen scraping

10
Jan

I’ve updated my simple PHP function (the one to replace fopen()) for grabbing URLs using cURL. I’ve added some features and made it a class instead of a straight PHP function. One improvement is the ability to normalize URLs so you can use relative URLs. It also has more error checking and uses a standard user-agent by default.

The syntax is a little different from the previous version. To use it, you create an instance of the object then call the proper method:

  1. $urlScoop = new UrlGrabber;
  2. $rawhtml=$urlScoop->_get($urlScoop->_normalize("https://www.simmonsconsulting.com/"));

Fetching a relative URL would look like this:

  1. $urlScoop = new UrlGrabber;
  2. $rawhtml=$urlScoop->_get($urlScoop->_normalize("../../Photos/"));

The full class is included after the jump.

  /**
   * Minimal cURL wrapper for fetching pages over HTTP(S).
   *
   * _get() and _post() return the response body as a string. When the
   * transfer fails, or the HTTP status is anything other than 200, a
   * human-readable note (and the cURL error text, if any) is appended to
   * the returned string instead of raising an error. The same string is
   * kept in $content and the curl_getinfo() result in $info.
   *
   * _normalize() turns a site-relative or document-relative URL into an
   * absolute one, using the current request's $_SERVER values.
   */
  class UrlGrabber {
     /** @var string Body (plus any appended diagnostic note) of the last request. */
     public $content = "";

     /** @var array|string curl_getinfo() result of the last request, "" before any request. */
     public $info = "";

     /** @var string User-agent header sent with every request; may be overridden. */
     public $userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";

     /** @var int Maximum number of seconds a request may take; may be overridden. */
     public $timeout = 60;

     /**
      * Fetch a URL with an HTTP GET request.
      *
      * @param string $url Absolute URL to fetch (see _normalize()).
      * @return string Response body, with a diagnostic note appended on failure.
      */
     public function _get($url) {
        $ch = $this->_open($url);
        if ($ch === false) {
           return($this->content);
        }
        return($this->_finish($ch, $url));
     }

     /**
      * Send an HTTP POST request, following any redirects.
      *
      * @param string       $url  Absolute URL to post to (see _normalize()).
      * @param array|string $vars POST fields, passed straight to CURLOPT_POSTFIELDS.
      * @return string Response body, with a diagnostic note appended on failure.
      */
     public function _post($url,$vars) {
        $ch = $this->_open($url);
        if ($ch === false) {
           return($this->content);
        }
        curl_setopt ($ch, CURLOPT_HEADER, 0);
        curl_setopt ($ch, CURLOPT_POST, 1);
        curl_setopt ($ch, CURLOPT_POSTFIELDS, $vars);
        curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1);
        return($this->_finish($ch, $url));
     }

     /**
      * Convert a URL into an absolute URL.
      *
      * - "http://..." / "https://..." (any letter case) is returned unchanged.
      * - "/path" is prefixed with the current scheme and host.
      * - anything else is treated as relative to the directory of the
      *   running script. That directory is derived from the segments of
      *   $_SERVER['PHP_SELF'] that also occur in getcwd(), so a script
      *   served through an alias whose name does not appear in the
      *   physical path resolves against the site root.
      *
      * "." and ".." segments are left in place, not collapsed.
      *
      * @param string $url Absolute, site-relative or document-relative URL.
      * @return string Absolute URL.
      */
     public function _normalize($url) {
        // is the url absolute? (URL schemes are case-insensitive)
        if (preg_match("/^https?:\/\//i", $url)) {
           return($url);
        }

        $url_root = $this->_urlRoot();

        // is the url locally absolute?
        if (preg_match("/^\//", $url)) {
           return($url_root . $url);
        }

        // the url must be relative
        $slash = '/';
        $self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
        $self = str_replace('\\', $slash, $self);
        $phys_path = str_replace('\\', $slash, (string) getcwd());

        $file_path_array = explode($slash, $self);
        array_pop($file_path_array); // drop the script's file name
        $cwd_path_array = explode($slash, $phys_path);

        // Keep only non-empty segments: the empty segment produced by the
        // leading "/" would otherwise yield "http://host//dir/".
        $self_dir_path_array = array_values(array_filter(
           array_intersect($file_path_array, $cwd_path_array),
           'strlen'
        ));

        $self_dir_path = '';
        if (count($self_dir_path_array) > 0) {
           $self_dir_path = implode($slash, $self_dir_path_array) . $slash;
        }

        return($url_root . $slash . $self_dir_path . $url);
     }

     /**
      * Build "scheme://host" for the current request.
      *
      * HTTPS is assumed when $_SERVER['HTTPS'] is set to a non-empty value
      * other than "off". The host is HTTP_HOST, falling back to SERVER_NAME.
      *
      * @return string e.g. "https://www.example.com"
      */
     private function _urlRoot() {
        $https = isset($_SERVER['HTTPS']) ? (string) $_SERVER['HTTPS'] : '';
        $root = ($https !== '' && strtolower($https) !== 'off') ? "https://" : "http://";

        if (isset($_SERVER['HTTP_HOST']) && strlen($_SERVER['HTTP_HOST']) > 0) {
           $root .= $_SERVER['HTTP_HOST'];
        } elseif (isset($_SERVER['SERVER_NAME'])) {
           $root .= $_SERVER['SERVER_NAME'];
        }
        return($root);
     }

     /**
      * Reset state and create a cURL handle with the options shared by
      * _get() and _post().
      *
      * @param string $url URL the handle will request.
      * @return mixed cURL handle, or false if it could not be created
      *               (in which case $content already holds the note).
      */
     private function _open($url) {
        $this->content = "";
        $this->info = "";

        $ch = curl_init ();
        if ($ch === false) {
           $this->content = "HTTP status was abnormal for $url [0]\nUnable to initialise a cURL session";
           return(false);
        }

        curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($ch, CURLOPT_USERAGENT, $this->userAgent);
        curl_setopt ($ch, CURLOPT_URL, $url);
        curl_setopt ($ch, CURLOPT_TIMEOUT, $this->timeout);
        return($ch);
     }

     /**
      * Execute a prepared handle, record the result in $content / $info
      * and release the handle.
      *
      * @param mixed  $ch  Handle returned by _open().
      * @param string $url URL being requested (used in the diagnostic note).
      * @return string Response body, with a diagnostic note appended on failure.
      */
     private function _finish($ch, $url) {
        $body = curl_exec ($ch);
        $this->info = curl_getinfo($ch);
        $status = (is_array($this->info) && isset($this->info['http_code'])) ? (int) $this->info['http_code'] : 0;

        $this->content = ($body === false) ? "" : $body;

        // Check for success, if anything weird happened append a note
        if ($body === false || $status !== 200) {
           if ($body !== false) {
              $this->content .= "\n\n";
           }
           $this->content .= "HTTP status was abnormal for $url [". $status. "]";
           $error = curl_error($ch);
           if ($error) {
              $this->content .= "\n". $error;
           }
        }

        // curl_close() has been a no-op since PHP 8.0; only older
        // versions need it to release the handle.
        if (!defined('PHP_VERSION_ID') || PHP_VERSION_ID < 80000) {
           curl_close ($ch);
        }
        return($this->content);
     }
  }

Leave a Reply

XHTML: You can use these tags: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <s> <strike> <strong>