`
conkeyn
  • 浏览: 1522935 次
  • 性别: Icon_minigender_1
  • 来自: 厦门
社区版块
存档分类
最新评论

采集数据

    博客分类:
  • PHP
阅读更多
<?php
// Database connection settings. The LOCAL_DB_* constants are not defined in
// this script; they must already be defined when it runs.
$config = array(
    'url'       => LOCAL_DB_HOST.':3306',
    'user'      => LOCAL_DB_USER,
    'password'  => LOCAL_DB_PWD,
    'db'        => LOCAL_DB_NAME,
);
$is_debug = true;

// The first command-line argument is the logging switch. When the script is
// started without arguments, null is passed instead of reading a missing
// array index (which would raise an "undefined index" notice).
$log = new CommonLog(array(
    'log_level' => isset($_SERVER["argv"][1]) ? $_SERVER["argv"][1] : null,
));

// Database handle used for all queries and inserts below.
$db = new DBConfig($config);

// Date of this run as YYYYMMDD; compared against and stored in data_date.
$today = date('Ymd');

// Ranking pages to scrape, keyed by the value stored in the "type" column.
$type_list = array(

    // Disabled source:
    // 'game'      => "http://top.baidu.com/buzz/game.html",

    'webgame'   => "http://top.baidu.com/buzz/mmogame.html",
    'rpg'       => "http://top.baidu.com/buzz/magic_rpg.html",
);

// Scrape each ranking page at most once per day and store one row per entry.
foreach ($type_list as $type => $url)
{
    // Most recent data_date already stored for this ranking type.
    // $type comes only from the hard-coded $type_list keys above.
    $max_date = $db->query_single("select max(data_date)
                                    from web_baidu_gametop50
                                    where type = '$type'");

    if (! $max_date || $max_date < $today)
    {
        $log->debug("start at page: " . $url);

        // Fetch the page, retrying up to 9 times in total. Warnings from
        // file_get_contents are suppressed because failure is handled
        // explicitly right below.
        $page = false;
        for ($cn = 0; $cn < 9 && $page === false; $cn++)
        {
            $page = @file_get_contents($url);
        }

        // Every attempt failed: do not feed FALSE into the parser (it would
        // silently yield an empty document); log it and move on.
        if ($page === false)
        {
            $log->debug("failed to fetch page: " . $url);
            continue;
        }

        // Rewrite the declared charset first: phpQuery cannot handle special
        // characters while the document still claims to be gb2312.
        $page = preg_replace('/gb2312/i', 'utf-8', $page);

        // Convert the actual bytes from gb2312 to utf-8, then parse.
        $doc = phpQuery::newDocumentHTML(
                mb_convert_encoding($page, 'utf-8', 'gb2312')
               );

        // Walk every ranking row, skipping the header row (class "th").
        foreach ($doc->find("div.list > table > tbody > tr")->not(".th") as $tr)
        {
            $tr                 = pq($tr);
            $data               = array();
            $data['index_id']   = $tr->find('> th:nth-child(1)')->text();
            $data['key_name']   = $tr->find('> td:nth-child(2)')->text();

            // Searches today.
            $data['search_num'] = $tr->find('> td:nth-child(5)')->text();

            // Searches over the last seven days.
            $data['count_num']  = $tr->find('> td:nth-child(6)')->text();

            // online_day and avg_num are not scraped from the page;
            // they are stored as 0.
            $data['online_day'] = 0;
            $data['avg_num']    = 0;
            $data['data_date']  = $today;
            $data['type']       = $type;
            $db->insert_array("web_baidu_gametop50", $data);
        }
    } else
    {
        $log->debug("page: (" . $url . ")has gathered before");
    }
}
?>

 dd

 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics