OMwiki:Tech/OMwiki-import.php
(Difference between revisions)
(typo fixes + copy and paste IA description) |
(update hack. PIDs are hidden + processes killed after a few minutes on shared hosting :-/) |
||
Line 19: | Line 19: | ||
require_once ( '../../../maintenance/commandLine.inc' ); | require_once ( '../../../maintenance/commandLine.inc' ); | ||
- | define( 'MV_DOWNLOAD_DIR', '/home/openmeet/www/ | + | define( 'MV_DOWNLOAD_DIR', '/home/openmeet/www/archives/' ); |
- | define( 'MV_ARCHIVE_ORG_DL', 'http:// | + | define( 'MV_ARCHIVE_ORG_DL', 'http://ia301540.us.archive.org/3/items/ovc_internetarchive_19june2009/' ); |
define( 'ARCHIVE_ORG_SKIP_LIST', '' ); | define( 'ARCHIVE_ORG_SKIP_LIST', '' ); | ||
Line 33: | Line 33: | ||
if ( count( $args ) == 0 || isset ( $options['help'] ) ) { | if ( count( $args ) == 0 || isset ( $options['help'] ) ) { | ||
print " | print " | ||
- | Downloads files from archive.org to configured directory:{$mvDownloadDir} | + | Downloads files from archive.org to configured directory: {$mvDownloadDir} |
options keyword: | options keyword: | ||
all //to get all | all //to get all | ||
- | [stream_name] //to grab a | + | [stream_name] //to grab a specific stream |
"; | "; | ||
Line 48: | Line 48: | ||
$sql = "SELECT * FROM `mv_streams` LIMIT 0, 5000"; | $sql = "SELECT * FROM `mv_streams` LIMIT 0, 5000"; | ||
} else { | } else { | ||
- | $sql = "SELECT * FROM `mv_streams` WHERE `name` | + | $sql = "SELECT * FROM `mv_streams` WHERE `name` LIKE '$stream_name'"; |
} | } | ||
$dbr = wfGetDB( DB_READ ); | $dbr = wfGetDB( DB_READ ); | ||
Line 55: | Line 55: | ||
$skip_list = unserialize( file_get_contents( ARCHIVE_ORG_SKIP_LIST ) ); | $skip_list = unserialize( file_get_contents( ARCHIVE_ORG_SKIP_LIST ) ); | ||
while ( $stream = $dbr->fetchObject( $result ) ) { | while ( $stream = $dbr->fetchObject( $result ) ) { | ||
- | $local_fl = MV_DOWNLOAD_DIR . $stream->name . '.ogv'; | + | $local_fl = MV_DOWNLOAD_DIR . strtolower($stream->name) . '.ogv'; |
- | + | // $remote_fl = MV_ARCHIVE_ORG_DL . strtolower($stream->name) . '/format=Ogg+Theora'; | |
- | $ | + | $remote_fl = MV_ARCHIVE_ORG_DL . strtolower($stream->name) . '.ogv'; |
- | + | ||
- | + | ||
+ | $remote_meta = MV_ARCHIVE_ORG_DL . strtolower($stream->name) . '/format=Metadata'; | ||
+ | $remote_animatedthumb = MV_ARCHIVE_ORG_DL . strtolower($stream->name) . '/format=Animated Gif'; | ||
+ | $remote_torrent = MV_ARCHIVE_ORG_DL . strtolower($stream->name) . '/format=Structural Metadata'; | ||
+ | |||
- | if ( is_file( $local_fl ) . META_DATA_EXT && is_file( $local_fl ) ) { | + | /* if ( is_file( $local_fl ) . META_DATA_EXT && is_file( $local_fl ) ) { |
// check db table for updated mv_flash_low_quality ref | // check db table for updated mv_flash_low_quality ref | ||
$sql = " SELECT * FROM `mv_stream_files` WHERE `stream_id`='" . $stream->id . "' " . | $sql = " SELECT * FROM `mv_stream_files` WHERE `stream_id`='" . $stream->id . "' " . | ||
Line 97: | Line 99: | ||
$dbw->query( $sql ); | $dbw->query( $sql ); | ||
- | } | + | } */ |
- | + | ||
- | // lets just skip local files for now and try to remove incomplete | + | // lets just skip local files for now and try to remove incomplete manually |
if ( isset( $skip_list[$stream->name] ) ) { | if ( isset( $skip_list[$stream->name] ) ) { | ||
print "skipping:" . $stream->name . "\n"; | print "skipping:" . $stream->name . "\n"; | ||
Line 106: | Line 108: | ||
} | } | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
- | + | ||
echo "DL it: $remote_fl \n"; | echo "DL it: $remote_fl \n"; | ||
if ( curldownload( $remote_fl, $local_fl ) ) { | if ( curldownload( $remote_fl, $local_fl ) ) { | ||
- | echo 'succesfully grabed ' . $remote_fl . "\n"; | + | echo 'succesfully grabed: ' . $remote_fl . "\n"; |
} ; | } ; | ||
- | + | ||
if ( !is_file( $local_fl ) . META_DATA_EXT ) { | if ( !is_file( $local_fl ) . META_DATA_EXT ) { | ||
- | echo " | + | echo "generating RSS entry (FIXME)...\n"; |
$flv = new MyFLV(); | $flv = new MyFLV(); | ||
- | try { | + | /*try { |
$flv->open( $local_fl ); | $flv->open( $local_fl ); | ||
} catch ( Exception $e ) { | } catch ( Exception $e ) { | ||
Line 143: | Line 123: | ||
} | } | ||
$flv->getMetaData(); | $flv->getMetaData(); | ||
- | echo "done with .meta (" . filesize( $local_fl . META_DATA_EXT ) . ") \n"; | + | echo "done with .meta (" . filesize( $local_fl . META_DATA_EXT ) . ") \n"; */ |
} | } | ||
// add to skip list | // add to skip list | ||
Line 155: | Line 135: | ||
function curldownload( $remote_file, $local_file ) { | function curldownload( $remote_file, $local_file ) { | ||
$pid = simple_run_background( "curl -L -C - -o $local_file $remote_file" ); | $pid = simple_run_background( "curl -L -C - -o $local_file $remote_file" ); | ||
- | print " | + | print "Started CURL download with pid: $pid \n"; |
$remote_size = remotefsize( $remote_file ); | $remote_size = remotefsize( $remote_file ); | ||
$prev_byte = 0; | $prev_byte = 0; | ||
- | while ( | + | while ( true ) { |
$speed = hr_bytes( ( filesize( $local_file ) - $prev_byte ) / 10 ); | $speed = hr_bytes( ( filesize( $local_file ) - $prev_byte ) / 10 ); | ||
echo "downloaded (" . hr_bytes( filesize( $local_file ) ) . ' of ' . hr_bytes( $remote_size ) . ") " . $speed . "/s \n"; | echo "downloaded (" . hr_bytes( filesize( $local_file ) ) . ' of ' . hr_bytes( $remote_size ) . ") " . $speed . "/s \n"; | ||
$prev_byte = filesize( $local_file ); | $prev_byte = filesize( $local_file ); | ||
+ | if ( $prev_byte == $remote_size ){break;} | ||
clearstatcache(); | clearstatcache(); | ||
- | sleep( | + | sleep( 120 ); |
} | } | ||
return true; | return true; | ||
Line 236: | Line 217: | ||
} | } | ||
function simple_run_background( $command ) { | function simple_run_background( $command ) { | ||
- | $PID = | + | $PID = exec( "nohup $command > /dev/null & echo $!" ); |
return $PID; | return $PID; | ||
} | } | ||
// Verifies if a process is running in linux | // Verifies if a process is running in linux | ||
- | function is_process_running( $PID ) { | + | /* function is_process_running( $PID ) { |
$ProcessState = ''; | $ProcessState = ''; | ||
exec( "ps $PID", $ProcessState ); | exec( "ps $PID", $ProcessState ); | ||
return( count( $ProcessState ) >= 2 ); | return( count( $ProcessState ) >= 2 ); | ||
- | } | + | } */ |
function hr_bytes( $size ) { | function hr_bytes( $size ) { | ||
$size = (int)$size; | $size = (int)$size; | ||
Line 256: | Line 237: | ||
} | } | ||
?> | ?> | ||
+ | |||
+ | |||
</pre> | </pre> |
Current revision as of 06:52, 7 January 2010
What this script should be capable of once finished:
- Copy video files into the /archives directory, followed by MD5SUM and oggz-validate checks.
- Insert the video into the MetaVidWiki database (mv_streams); also populate date_start_time from unix_timestamp (located in IA metadata).
- Generate and insert <item> elements into the Media RSS feeds, followed by feed validation.
- Insert an animated thumbnail into OMwiki:Finding aid.
- Copy the Internet Archive description into corresponding wikitext.
<?php
/**
 * archive.org import script (MediaWiki command-line maintenance script).
 *
 * Reads stream names from the `mv_streams` table and downloads the matching
 * .ogv file from the configured archive.org item into MV_DOWNLOAD_DIR.
 *
 * Usage (arguments are provided by commandLine.inc as $args / $options):
 *   all            download every stream (first 5000 rows)
 *   [stream_name]  download a specific stream
 */
$cur_path = $IP = dirname( __FILE__ );
// include commandLine.inc from the MediaWiki maintenance dir:
require_once ( '../../../maintenance/commandLine.inc' );

define( 'MV_DOWNLOAD_DIR', '/home/openmeet/www/archives/' );
define( 'MV_ARCHIVE_ORG_DL', 'http://ia301540.us.archive.org/3/items/ovc_internetarchive_19june2009/' );
// Path of the serialized skip list. While this is empty the skip list is
// kept in memory only and is not persisted between runs.
define( 'ARCHIVE_ORG_SKIP_LIST', '' );
//define( 'MV_BASE_MEDIA_SERVER_PATH', 'http://mvbox2.cse.ucsc.edu/mvFlvServer.php/' );
define( 'META_DATA_EXT', '.meta' );
// for generating flv metadata:
//include_once( '../skins/mv_embed/flvServer/MvFlv.php' );

if ( count( $args ) == 0 || isset ( $options['help'] ) ) {
    // The original text interpolated the undefined variable $mvDownloadDir;
    // the configured directory lives in the MV_DOWNLOAD_DIR constant.
    print "
Downloads files from archive.org to configured directory: " . MV_DOWNLOAD_DIR . "
options keyword:
all //to get all
[stream_name] //to grab a specific stream
";
} else {
    proccess_streams( $args[0] );
}

/**
 * Download the .ogv file of one stream, or of all streams, from archive.org.
 *
 * Streams listed in the skip list are not downloaded again. A stream is added
 * to the skip list once its download has been confirmed and the local file
 * exists.
 *
 * @param string $stream_name 'all', or a name matched with SQL LIKE against
 *                            `mv_streams`.`name`
 */
function proccess_streams( $stream_name = 'all' ) {
    $dbr = wfGetDB( DB_READ );
    if ( $stream_name == 'all' ) {
        $sql = "SELECT * FROM `mv_streams` LIMIT 0, 5000";
    } else {
        // Quote the command-line value instead of interpolating it raw.
        // LIKE wildcards in the argument keep working as before.
        $sql = "SELECT * FROM `mv_streams` WHERE `name` LIKE " . $dbr->addQuotes( $stream_name );
    }
    $result = $dbr->query( $sql );
    $skip_list = load_skip_list();

    while ( $stream = $dbr->fetchObject( $result ) ) {
        $stream_key = strtolower( $stream->name );
        $local_fl = MV_DOWNLOAD_DIR . $stream_key . '.ogv';
        // $remote_fl = MV_ARCHIVE_ORG_DL . $stream_key . '/format=Ogg+Theora';
        $remote_fl = MV_ARCHIVE_ORG_DL . $stream_key . '.ogv';

        // Companion URLs; nothing in this script fetches them yet.
        $remote_meta = MV_ARCHIVE_ORG_DL . $stream_key . '/format=Metadata';
        $remote_animatedthumb = MV_ARCHIVE_ORG_DL . $stream_key . '/format=Animated Gif';
        $remote_torrent = MV_ARCHIVE_ORG_DL . $stream_key . '/format=Structural Metadata';

        /* Disabled legacy code (kept for reference, braces are unbalanced as in
           the original; it also needs $dbw = wfGetDB( DB_WRITE ) and the
           MV_BASE_MEDIA_SERVER_PATH constant):

        if ( is_file( $local_fl ) . META_DATA_EXT && is_file( $local_fl ) ) {
            // check db table for updated mv_flash_low_quality ref
            $sql = " SELECT * FROM `mv_stream_files` WHERE `stream_id`='" . $stream->id . "' " .
                " AND `file_desc_msg`='mv_flash_low_quality'";
            $resFcheck = $dbr->query( $sql );
            if ( $dbr->numRows( $resFcheck ) == 0 ) {
                // grab duration from mv_ogg_low_quality
                $sql = " SELECT * FROM `mv_stream_files` WHERE `stream_id`='" . $stream->id . "' " .
                    " AND `file_desc_msg`='mv_ogg_low_quality'";
                $rdur = $dbr->query( $sql );
                $dur_val = 0;
                if ( $dbr->numRows( $rdur ) ) {
                    $ogg_file = $dbr->fetchObject( $rdur );
                    $dur_val = $ogg_file->duration;
                }
                $dbw->insert( 'mv_stream_files', array(
                    'stream_id' => $stream->id,
                    'duration' => $dur_val,
                    'file_desc_msg' => 'mv_flash_low_quality',
                    'path_type' => 'url_anx',
                    'path' => MV_BASE_MEDIA_SERVER_PATH . $stream->name . ".flv'"
                ) );
                print $dbw->lastQuery();
                die;
                print "insert {$stream->name}.flv\n";
                // $dbw->query($sql);
            } else {
                $file = $dbr->fetchObject( $resFcheck );
                $dbr->update( 'mv_stream_files',
                    array( 'path' => MV_BASE_MEDIA_SERVER_PATH . $stream->name . '.flv' ),
                    array( 'id' => $file->id ),
                    __METHOD__,
                    array( 'LIMIT' => 1 ) );
                $dbw->query( $sql );
            }
        */

        // lets just skip local files for now and try to remove incomplete manually
        if ( isset( $skip_list[$stream->name] ) ) {
            print "skipping:" . $stream->name . "\n";
            continue;
        }

        echo "DL it: $remote_fl \n";
        $downloaded = curldownload( $remote_fl, $local_fl );
        if ( $downloaded ) {
            echo 'succesfully grabed: ' . $remote_fl . "\n";
        } else {
            echo 'download incomplete or could not be verified: ' . $remote_fl . "\n";
        }

        // The original test was "!is_file( $local_fl ) . META_DATA_EXT", which
        // concatenates a boolean with a string and is therefore always true.
        if ( !is_file( $local_fl . META_DATA_EXT ) ) {
            echo "generating RSS entry (FIXME)...\n";
            /* Disabled legacy code. The MyFLV instantiation is disabled as well:
               the include that provides the class is commented out at the top
               of this file and the object was never used.

            $flv = new MyFLV();
            try {
                $flv->open( $local_fl );
            } catch ( Exception $e ) {
                die( "The following exception was detected while trying to open a FLV file:\n" . $e->getMessage() . "" );
            }
            $flv->getMetaData();
            echo "done with .meta (" . filesize( $local_fl . META_DATA_EXT ) . ") \n";
            */
        }

        // add to skip list
        // Nothing generates the .meta file while the code above is disabled, so
        // only the downloaded file itself is required here. (The original
        // condition reduced to is_file( $local_fl ) for the same reason as the
        // always-true test above.)
        if ( $downloaded && is_file( $local_fl ) ) {
            $skip_list[$stream->name] = true;
        }
        save_skip_list( $skip_list );
    }
}

/**
 * Load the skip list from ARCHIVE_ORG_SKIP_LIST.
 *
 * @return array stream name => true; empty when no list file is configured,
 *               the file is missing, or its content is not a serialized array
 */
function load_skip_list() {
    if ( ARCHIVE_ORG_SKIP_LIST === '' || !is_file( ARCHIVE_ORG_SKIP_LIST ) ) {
        return array();
    }
    $raw = file_get_contents( ARCHIVE_ORG_SKIP_LIST );
    if ( $raw === false ) {
        return array();
    }
    $list = unserialize( $raw );
    return is_array( $list ) ? $list : array();
}

/**
 * Persist the skip list to ARCHIVE_ORG_SKIP_LIST.
 *
 * @param array $skip_list stream name => true
 * @return bool true when the list was written; false when no list file is
 *              configured or the write failed
 */
function save_skip_list( $skip_list ) {
    if ( ARCHIVE_ORG_SKIP_LIST === '' ) {
        return false;
    }
    return file_put_contents( ARCHIVE_ORG_SKIP_LIST, serialize( $skip_list ) ) !== false;
}

/**
 * Current size of a local file, bypassing PHP's stat cache.
 *
 * @param string $path
 * @return int size in bytes, 0 when the file does not exist (yet)
 */
function local_file_size( $path ) {
    clearstatcache();
    return is_file( $path ) ? (int)filesize( $path ) : 0;
}

/**
 * Download a file with a background curl process and poll until it is done.
 *
 * The download counts as complete once the local file has reached the size
 * reported by the remote server. The curl process itself is not inspected;
 * if the local file stops growing for $max_stalled_polls consecutive polls
 * the function gives up instead of polling forever.
 *
 * @param string $remote_file       URL to download
 * @param string $local_file        target path (an existing partial file is resumed by curl -C -)
 * @param int    $poll_seconds      seconds between progress checks
 * @param int    $max_stalled_polls polls without growth before giving up
 * @return bool true when the local size reached the remote size, false when
 *              the download stalled or the remote size is unknown
 */
function curldownload( $remote_file, $local_file, $poll_seconds = 120, $max_stalled_polls = 3 ) {
    // Escape both arguments: URLs and paths may contain spaces or shell metacharacters.
    $pid = simple_run_background(
        'curl -L -C - -o ' . escapeshellarg( $local_file ) . ' ' . escapeshellarg( $remote_file )
    );
    print "Started CURL download with pid: $pid \n";

    $remote_size = remotefsize( $remote_file );
    // An unknown (false) or zero size cannot be used as a completion mark.
    $size_known = ( $remote_size !== false && $remote_size > 0 );

    $poll_seconds = max( 1, (int)$poll_seconds );
    $prev_byte = 0;
    $stalled_polls = 0;
    $first_poll = true;
    while ( true ) {
        $cur_byte = local_file_size( $local_file );
        // Average speed over the actual polling interval.
        $speed = hr_bytes( max( 0, $cur_byte - $prev_byte ) / $poll_seconds );
        echo "downloaded (" . hr_bytes( $cur_byte ) . ' of ' . hr_bytes( $remote_size ) . ") " . $speed . "/s \n";

        if ( $size_known && $cur_byte >= $remote_size ) {
            return true;
        }

        // The first poll runs right after curl was started, so it is not
        // counted as a stall.
        if ( !$first_poll && $cur_byte <= $prev_byte ) {
            $stalled_polls++;
            if ( $stalled_polls >= $max_stalled_polls ) {
                echo "no progress after $stalled_polls checks, giving up on $remote_file \n";
                return false;
            }
        } else {
            $stalled_polls = 0;
        }

        $first_poll = false;
        $prev_byte = $cur_byte;
        sleep( $poll_seconds );
    }
}

/**
 * Copy a remote file to a local file through PHP streams, printing progress.
 *
 * @param string $file_source URL of the source; spaces are encoded as %20
 * @param string $file_target local target path, overwritten if it exists
 * @param string $sn          label printed in the progress lines
 * @return bool true on success, false when a stream could not be opened or a
 *              read/write failed
 */
function download ( $file_source, $file_target, $sn ) {
    // Preparations
    $file_source = str_replace( ' ', '%20', html_entity_decode( $file_source ) ); // fix url format
    if ( file_exists( $file_target ) ) {
        chmod( $file_target, 0777 ); // add write permission
    }
    $remote_size = remotefsize( $file_source );

    // Begin transfer; fopen() emits its own error messages.
    $rh = fopen( $file_source, 'rb' );
    if ( $rh === false ) {
        return false;
    }
    $wh = fopen( $file_target, 'wb' );
    if ( $wh === false ) {
        fclose( $rh ); // do not leak the read handle
        return false;
    }

    $i = 0;
    while ( !feof( $rh ) ) {
        // report progress every 2000 chunks
        if ( $i == 2000 ) {
            $i = 0;
            clearstatcache(); // refresh before reading the size, not after
            $lfs = filesize( $file_target );
            print hr_bytes( $lfs ) . ' of ' . hr_bytes( $remote_size ) . " of $sn \n";
        }
        $i++;

        $chunk = fread( $rh, 8192 );
        // A failed read, or a failed write (possibly because the harddrive
        // has filled up), aborts the transfer.
        if ( $chunk === false || fwrite( $wh, $chunk ) === false ) {
            fclose( $rh );
            fclose( $wh );
            return false;
        }
    }

    // Finished without errors
    fclose( $rh );
    fclose( $wh );
    return true;
}

/**
 * Determine the size of a remote file.
 *
 * http/https URLs are answered from the Content-Length response header,
 * ftp/ftps URLs through ftp_size().
 *
 * @param string $url
 * @return int|float|false size in bytes, or false when it cannot be determined
 */
function remotefsize( $url ) {
    // The scheme used to be hard-coded to 'http', which made the FTP branch
    // below unreachable.
    $sch = strtolower( (string)parse_url( $url, PHP_URL_SCHEME ) );

    if ( ( $sch == "http" ) || ( $sch == "https" ) ) {
        $headers = get_headers( $url, 1 );
        if ( !is_array( $headers ) ) {
            return false;
        }
        // Header names are case-insensitive.
        $headers = array_change_key_case( $headers, CASE_LOWER );
        if ( !array_key_exists( 'content-length', $headers ) ) {
            return false;
        }
        $length = $headers['content-length'];
        // After redirects get_headers() returns one value per response;
        // the last one belongs to the final response.
        if ( is_array( $length ) ) {
            $length = end( $length );
        }
        return is_numeric( $length ) ? $length + 0 : false;
    }

    if ( ( $sch == "ftp" ) || ( $sch == "ftps" ) ) {
        $server = parse_url( $url, PHP_URL_HOST );
        $port = parse_url( $url, PHP_URL_PORT );
        $path = parse_url( $url, PHP_URL_PATH );
        $user = parse_url( $url, PHP_URL_USER );
        $pass = parse_url( $url, PHP_URL_PASS );
        if ( ( !$server ) || ( !$path ) ) {
            return false;
        }
        if ( !$port ) {
            $port = 21;
        }
        if ( !$user ) {
            $user = "anonymous";
        }
        if ( !$pass ) {
            $pass = "phpos@";
        }
        if ( $sch == "ftps" ) {
            $ftpid = ftp_ssl_connect( $server, $port );
        } else {
            $ftpid = ftp_connect( $server, $port );
        }
        if ( !$ftpid ) {
            return false;
        }
        if ( !ftp_login( $ftpid, $user, $pass ) ) {
            ftp_close( $ftpid ); // do not leak the connection
            return false;
        }
        $ftpsize = ftp_size( $ftpid, $path );
        ftp_close( $ftpid );
        if ( $ftpsize == - 1 ) {
            return false;
        }
        return $ftpsize;
    }

    // Unsupported or missing scheme.
    return false;
}

/**
 * Start a shell command in the background and return its process id.
 *
 * @param string $command complete shell command; the caller is responsible
 *                        for escaping its arguments
 * @return string the last output line of the launching shell, i.e. the value of $!
 */
function simple_run_background( $command ) {
    // Single quotes keep "$!" literal so the shell, not PHP, expands it.
    $PID = exec( 'nohup ' . $command . ' > /dev/null & echo $!' );
    return $PID;
}

// Verifies if a process is running in linux
/* function is_process_running( $PID ) {
    $ProcessState = '';
    exec( "ps $PID", $ProcessState );
    return( count( $ProcessState ) >= 2 );
} */

/**
 * Format a byte count as a human-readable string, e.g. 1536 => "1.5 KB".
 *
 * @param int|float|string|bool $size byte count; cast to int first
 * @return string value rounded to two decimals followed by its unit
 */
function hr_bytes( $size ) {
    $size = (int)$size;
    $a = array( "B", "KB", "MB", "GB", "TB", "PB" );
    $pos = 0;
    $last = count( $a ) - 1;
    // Stop at the largest known unit instead of indexing past the table.
    while ( $size >= 1024 && $pos < $last ) {
        $size /= 1024;
        $pos++;
    }
    return round( $size, 2 ) . " " . $a[$pos];
}
?>