PHP可以灵活配置使用的采集器

相关推荐

PHP可以灵活配置使用的采集器

　　想用PHP实现一个可以灵活配置使用的采集器吗？就跟随百分网小编一起去了解下吧，想了解更多相关信息请持续关注我们应届毕业生考试网！

　　代码：

　　<?php

　　/**

　　* 可以灵活配置使用的采集器

　　* 作者：Rain

　　* 创建时间：2015-02-03 15:17:30

　　* 版本信息：V1.0

　　///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

　　//数据库的相关配置信息,请根据您的数据库信息进行配置

　　define('DB_HOST', 'localhost');

　　define('DB_USER', 'root');

　　define('DB_PWD', 'test123456');

　　define('DB_NAME', 'test_dbname');

　　define('DB_CHARSET', 'utf8');

　　define('TABLE_NAME', 'tb_book');

　　//end

　　//网站信息相关的配置,请根据具体需要采集的网站内容信息进行配置

　　define('WEB_CHARSET', 'gbk');

　　//变动的参数，使用%d进行替换,只支持数值形式的变动

　　define('WEB_LIST_URL', 'http://www.pcbookcn.com/book/1_%d.htm');

　　//分页的条数

　　define('PAGE_COUNT', 14);

　　//从哪个页面开始抓取

　　define('PAGE_START', 1);

　　//内容页的URL,使用正则模式,必须包含/,例如：/\/xuefu2008\/article\/details\/(\d)+/i

　　define('WEB_CONTENT_URL_REG', '/\/book\/(\d)+\.htm/i');

　　//网站域名HOST信息,不包含末尾的/，例如：http://blog.csdn.net

　　define('WEB_HOST', 'http://www.pcbookcn.com');

　　//列表页内容的精准定位，用来大致抓取一个列表页的内容显示模块位置，使用正则进行定位

　　define('WEB_LIST_POSTION', '/book_name\.gif(.*?)<td\swidth="15\%"\snowrap>/i');

　　//end

　　//微调参数，通常不修改也不会影响您的正常使用

　　define('SLEEP_TIME', 1);

　　define('IS_DEBUG', false);

　　define('INSERT_DB', true);

　　//内容的输出速度，单位：秒

　　define('OUTPUT_SPEED', 1);

　　//end

　　//需要过滤删除的文字,根据采集的网站类型进行设置,不区分大小写

　　$text_filter = array(

　　'- 中华电脑书库' => '',

　　'_电脑电子书' => '',

　　'_电脑书籍' => '',

　　'下载' => '',

　　);

　　//表结构映射的配置

　　$table_mapping = array(

　　//表字段名称 => 获取该字段的正则表达式,非空字段都必须在此设置映射关系，常量值请直接填写具体对应的值，无需使用正则

　　'size' => '/软件大小.*?000000>(.*?)<\/font>/i',

　　'logo' => 'http://www.94cto.com/index/uploads/images/20150105/0b8461910de101cc51a07684cdab797e.jpg',

　　'field1' => '/<title>(.*?)<\/title>/i',

　　'field2' => '/软件简介.*?000000>(.*?)<\/font>/i',

　　'field3' => '1',

　　'field4' => '1',

　　'field5' => '1',

　　'field6' => '电子书,计算机,图像,图形',

　　'platform' => 'window/Linux',

　　'ishot' => '1',

　　'agreement' => '免费',

　　'downurl' => '/(\/down\.asp\?id=.*?)"/i',

　　'istop' => '1',

　　);

　　///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

　　$ga = new Gather();

　　$ga->run();

　　class Gather

　　{

　　public function __construct()

　　{

　　$this->init_check();

　　}

　　public function run()

　　{

　　global $table_mapping, $text_filter;

　　for ($page = PAGE_START; $page <= PAGE_COUNT; $page++)

　　{

　　$this->write('开始采集列表第'.$page.'页的内容...');

　　$list_content = $this->get(sprintf(WEB_LIST_URL, $page));

　　if (empty($list_content))

　　{

　　$this->write('抓取的列表页的内容为空，所以过滤掉');

　　continue;

　　}

　　$list_content = str_replace("\r", '', $list_content);

　　$list_content = str_replace("\n", '', $list_content);

　　//精准定位要抓取的模块内容

　　if (!preg_match(WEB_LIST_POSTION, $list_content, $list_search))

　　{

　　$this->write('精准匹配列表页的内容失败，所以过滤掉');

　　continue;

　　}

　　if (isset($list_search[1]))

　　$list_content = $list_search[1];

　　else

　　$list_content = $list_search[0];

　　//end

　　preg_match_all(WEB_CONTENT_URL_REG, $list_content, $match);

　　if (is_array($match[0]) && !empty($match[0]))

　　{

　　$this->write('当前的列表页面，总共匹配到：'.count($match[0]).'个内容页');

　　foreach ($match[0] as $val)

　　{

　　if (strpos($val, 'http:') === false)

　　{

　　if (substr($val, 0, 1) == '/')

　　$val = WEB_HOST.$val;

　　else

　　$val = WEB_HOST.'/'.$val;

　　}

　　$web_content = $this->get($val);

　　if (empty($web_content))

　　{

　　$this->write('抓取的内容页为空,所以过滤掉');

　　continue;

　　}

　　$web_content = str_replace("\r", '', $web_content);

　　$web_content = str_replace("\n", '【】', $web_content);

　　$sql = "INSERT INTO ".TABLE_NAME."(".implode(', ', array_keys($table_mapping)).")VALUES(";

　　foreach ($table_mapping as $field => $reg)

　　$sql .= ':'.$field.',';

　　$sql = substr($sql ,0, -1);

　　$sql .= ')';

　　if (IS_DEBUG)

　　$this->write('执行SQL '.$sql);

　　$dsn = 'mysql:dbname='.DB_NAME.';host='.DB_HOST;

　　try {

　　$dbh = new PDO($dsn, DB_USER, DB_PWD);

　　} catch (PDOException $e) {

　　$this->write( 'Connection failed: ' . $e->getMessage(), true);

　　}

　　$dbh->query("set names 'utf8'");

　　$sth = $dbh->prepare($sql);

　　foreach ($table_mapping as $field => $reg)

　　{

　　if (substr($reg, 0, 1) != '/')

　　{

　　$$field = $reg;

　　}

　　else

　　{

　　if (!preg_match($reg, $web_content, $tmp_match))

　　{

　　$this->write('对不起,匹配字段：'.$field.'失败，过滤此记录');

　　continue 2;

　　}

　　$$field = $tmp_match[1];

　　$$field = $this->closetags($$field);

　　//删除javascript脚本

　　$$field = preg_replace('/<script(.*?)>(.*?)<\/script>/i', '', $$field);

　　//将链接删除

　　$$field = preg_replace('/<a(.*?)>(.*?)<\/a>/i', '${2}', $$field);

　　//图片链接地址绝对地址化

　　preg_match_all('/<img.*?src=("|\')+(.*?)("|\')+.*?>/i', $$field, $img_match);

　　if (isset($img_match[2]) && is_array($img_match[2]) && !empty($img_match[2]))

　　{

　　foreach ($img_match[2] as $img_val)

　　{

　　if (strpos($img_val, 'http:') === false)

　　{

　　$new_val = $img_val;

　　if (substr($new_val, 0, 1) != '/')

　　$new_val = '/'.$img_val;

　　$new_val = WEB_HOST.$new_val;

　　$$field = str_replace($img_val, $new_val, $$field);

　　}

　　//end

　　//针对HTML里面的pre的换行先做一个特殊处理

　　$$field = preg_replace('/<pre.*?>(.*?)<\/pre>/i', '<pre class="prettyprint">${1}</pre>', $$field);

　　preg_match_all('/<pre>(.*?)<\/pre>/i', $$field, $pre_match);

　　if (isset($pre_match[1]) && is_array($pre_match[1]) && !empty($pre_match[1]))

　　{

　　foreach ($pre_match[1] as $pre_val)

　　$$field = str_replace($pre_val, str_replace("【】", "\r\n", $pre_val), $$field);

　　}

　　//end

　　}

　　//入库之前，将对应的换行符号都还原回来

　　$$field = str_replace('【】', "\r\n", $$field);

　　//文本的过滤和替换操作

　　if (is_array($text_filter) && !empty($text_filter))

　　{

　　foreach ($text_filter as $tk => $tv)

　　$$field = str_ireplace($tk, $tv, $$field);

　　}

　　if (IS_DEBUG)

　　$this->write('*'."\t".'字段：'.$field.' 值：'."\n****************************************************\n".$$field."\n****************************************************");

　　if ('downurl' == $field && stripos($$field, 'http:') === false)

　　if (substr($$field, 0, 1) == '/')

　　$$field = WEB_HOST.trim($$field);

　　else

　　$$field = WEB_HOST.'/'.trim($$field);

　　$sth->bindValue(':'.$field, trim($$field));

　　}

　　if (INSERT_DB)

　　$sth->execute();

　　$sth->closeCursor();

　　$this->write( '休息，暂停'.SLEEP_TIME.'秒后继续抓取...');

　　sleep(SLEEP_TIME);

　　}

　　else

　　{

　　$this->write('列表页面没有抓取到内容，所以过滤掉');

　　}

　　$this->write('', true);

　　}

　　protected function closetags($html)

　　{

　　// 不需要补全的标签

　　$arr_single_tags = array('meta', 'img', 'br', 'link', 'area');

　　// 匹配开始标签

　　preg_match_all('#<([a-z]+)(?: .*)?(?<![/|/ ])>#iU', $html, $result);

　　$openedtags = $result[1];

　　// 匹配关闭标签

　　preg_match_all('#</([a-z]+)>#iU', $html, $result);

　　$closedtags = $result[1];

　　// 计算关闭开启标签数量，如果相同就返回html数据

　　$len_opened = count($openedtags);

　　if (count($closedtags) == $len_opened) {

　　return $html;

　　}

　　// 把排序数组，将最后一个开启的标签放在最前面

　　$openedtags = array_reverse($openedtags);

　　// 遍历开启标签数组

　　for ($i = 0; $i < $len_opened; $i++) {

　　// 如果需要补全的标签

　　if (!in_array($openedtags[$i], $arr_single_tags)) {

　　// 如果这个标签不在关闭的标签中

　　if (!in_array($openedtags[$i], $closedtags)) {

　　// 直接补全闭合标签

　　$html .= '</' . $openedtags[$i] . '>';

　　} else {

　　unset($closedtags[array_search($openedtags[$i], $closedtags)]);

　　}

　　return $html;

　　}

　　protected function init_check()

　　{

　　if (!$this->check_curl_support())

　　$this->write('对不起，请先开启CURL的类库的支持，否则无法执行', true);

　　$this->check_mysql_connect();

　　$this->write('程序初始化检查通过,执行后续的流程...');

　　}

　　private function get($url, $data = array())

　　{

　　$this->write('开始执行抓取: '.$url);

　　$ch = curl_init();

　　curl_setopt($ch, CURLOPT_URL, $url);

　　//curl_setopt($ch, CURLOPT_USERAGENT, "Baiduspider+(+http://www.baidu.com/search/spider.htm)");

　　curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");

　　curl_setopt($ch, CURLOPT_HEADER, 0);

　　curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

　　curl_setopt($ch, CURLOPT_HTTPHEADER, $data);

　　$ret = curl_exec($ch);

　　$error = curl_error($ch);

　　curl_close($ch);

　　unset($ch);

　　if (!empty($error))

　　{

　　$this->write('程序抓取URL: '.$url.'发生错误，错误信息: '.$error);

　　return false;

　　}

　　if (WEB_CHARSET != 'utf-8')

　　$ret = iconv(WEB_CHARSET, 'utf-8', $ret);

　　return $ret;

　　}

　　//when check finish,mysql connect will auto close

　　private function check_mysql_connect()

【PHP可以灵活配置使用的采集器】相关文章：

php学习之php配置 07-15

PHP基础配置 09-25

PHP安装与配置 09-09

PHP socket的配置及实例 10-16

PHP配置文件详解php.ini 10-19