http://'. $host .'/search/.b'. $bibn .'/.b'. $bibn .'/1%2C1%2C1%2CB/marc~b'. $bibn);
// echo ('
http://'. $host .'/record=b'. $bibn);
if(!strpos(file_get_contents('http://'. $host .'/record=b'. $bibn), 'No matches found')){
return('http://'. $host .'/search/.b'. $bibn .'/.b'. $bibn .'/1%2C1%2C1%2CB/marc~b'. $bibn);
}else{
echo '
no such record, or record is suppressed from public view
'; } }

/**
 * Fetch a MARC display page and return the text captured from it.
 *
 * The page is read with file_get_contents(), converted to UTF-8 with
 * mb_convert_encoding() (no source encoding is given, so the mbstring
 * default is used), and the first capture group of the pattern below is
 * returned.
 *
 * @param string $targeturl Location to read; anything file_get_contents() accepts.
 * @return string Captured text, or an empty string when the fetch fails or
 *                the pattern does not match.
 */
public function fetchmarc($targeturl){
	$page = file_get_contents($targeturl);
	if($page === false){
		// PHP has already raised a warning for the failed read; hand back an
		// empty record so the caller finds no title and skips it.
		return('');
	}

	$record = mb_convert_encoding($page, 'UTF-8');

	// NOTE(review): as written this captures everything from the start of the
	// text up to the first less-than sign. parse_marc() throws away the first
	// and last line of what it is given and expects a LEADER line next, which
	// suggests the group was once wrapped in literal markup that has been lost
	// from this copy. Confirm against upstream before relying on it.
	if(!preg_match('/([^<]*)/', $record, $stuff) || !isset($stuff[1])){
		return('');
	}

	return($stuff[1]);
}
/**
 * Turn the subfields of one exploded MARC line into a keyed array.
 *
 * Element 0 of $lineray is dropped. Every remaining element is read as a
 * one-character code followed by its text. The key is the code plus a
 * per-code running number, so repeated codes stay distinct:
 *
 *   array('650', 'aCats', 'xOne', 'xTwo')
 *     gives array('a1' => 'Cats', 'x1' => 'One', 'x2' => 'Two')
 *
 * Empty elements carry no code and are skipped.
 *
 * @param array $lineray Exploded line; index 0 is ignored.
 * @return array Subfield text keyed by code and occurrence number.
 */
private function parse_marc_row($lineray){
	$marcrow = array();
	$count = array();

	unset($lineray[0]);

	foreach($lineray as $element){
		$element = (string) $element;
		if($element === ''){
			// nothing to key on (for example two separators in a row)
			continue;
		}

		// square brackets: the curly-brace offset form no longer parses on PHP 8
		$code = $element[0];
		$count[$code] = isset($count[$code]) ? $count[$code] + 1 : 1;

		// NOTE(review): this replace swaps a single space for a single space,
		// so it changes nothing as written. Presumably it once collapsed a
		// wider run of whitespace; confirm against upstream.
		$marcrow[$code . $count[$code]] = trim(str_replace(' ', ' ', substr($element, 1)));
	}

	return($marcrow);
}
/**
 * Convert the text of a MARC display into the array used by the importer.
 *
 * Preparation: a newline followed by a space is folded into a single space,
 * the text is split on newlines, the last and the first line are discarded,
 * and LEADER at the start of the remaining first line is rewritten to 000.
 *
 * Each line is trimmed and rebuilt as three pipe-separated parts: characters
 * 0-2 (the tag), characters 4-5 (presumably the indicators), and the letter a
 * followed by everything from character 7. That string is appended to
 * rawrecord, stripped of one trailing period, split on the pipe, and the
 * second part is dropped. The numeric tag then selects what is stored:
 *
 *   100, 110, 700-720   author
 *   10                  lccn
 *   20                  isbn
 *   245                 title and attribution
 *   240, 246, 721-740   alttitle
 *   260                 pubyear (first four digits of subfield c, padded with 5)
 *   5                   catdate (characters 7-14 of the line, hyphenated)
 *   8                   pubyear (four characters from offset 14 of the line)
 *   600-699             subject and subjkey
 *   856                 url
 *   300-399             physdesc
 *   400-499             title
 *   800-840             series
 *   500-599             504 is skipped, 505 goes to contents
 *   240-245             subfield h mapped to a format label while none is set
 *
 * After the loop: format defaults to Book, acqdate is copied from catdate,
 * catdate defaults to 1984-01-01, a pubyear more than five years ahead is
 * replaced by the catdate year, pubdate is built from pubyear plus the rest
 * of catdate, alttitle entries are merged into title, and the collected
 * scrib:key=value strings are stored under tags.
 *
 * @param string $marcrecord MARC display text, as produced by fetchmarc().
 * @return array Record fields; most entries are lists of strings.
 *
 * NOTE(review): this copy of the function looks damaged. Near the end many
 * statements share one physical line together with line comments, and some
 * string literals appear to have lost markup (the url entry, the 505
 * contents entry, a stray quote after the commented-out echo). Restore from
 * upstream before changing any logic.
 * NOTE(review): ereg_replace() and eregi() were removed in PHP 7.0, and
 * curly-brace string offsets (used for catdate) in PHP 8.0.
 * NOTE(review): the second test for tag 110 can never run because the first
 * author condition already accepts 110.
 * NOTE(review): tags 400-499 are appended to title, next to 245; confirm
 * that this is intended rather than series.
 * NOTE(review): date(Y) uses a bare constant; rawrecord, the tags list and
 * the isbn, subjkey and title lists are used without being initialised, so
 * the closing loops and merge fail on records that lack those fields.
 * NOTE(review): acqdate is copied from catdate before catdate receives its
 * default; pubdate is a plain string while acqdate is a list, yet scrapeit()
 * indexes both with [0].
 */
public function parse_marc($marcrecord){
$atomic = array();
// fold continuation lines (newline plus space) back into the preceding line
$marcrecord = str_replace("\n ", ' ', $marcrecord);
$details = explode("\n", $marcrecord);
// unset($details[0]);
// discard the last line, then the first line, of the split text
array_pop($details);
array_shift($details);
// give the leader line a numeric tag so it fits the tag dispatch below
$details[0] = str_replace('LEADER ', '000 ', $details[0]);
//print_r($details);
foreach($details as $line){
// start every line with fresh per-line state
unset($lineray);
unset($marc);
$line = trim($line);
// tag, pipe, two characters from offset 4, pipe, the letter a, then the data from offset 7
$lineray = substr($line, 0, 3) . '|' . substr($line, 4, 2) . '|a' . substr($line, 7);
$atomic['rawrecordtype'] = 'iiimarc';
// keep the rebuilt line as part of the raw record
$atomic['rawrecord'] .= $lineray . "\n";
//echo "$lineray
";
$lineray = explode('|', ereg_replace('\.$', '', $lineray));
//print_r($lineray);
unset($lineray[1]);
// if($lineray[0] > 9)
// unset($lineray[2]);
// Authors
if(($lineray[0] == 100) || ($lineray[0] == 110)){
//print_r($line);
$marc = self::parse_marc_row($lineray);
$temp = ereg_replace(',$', '', $marc['a1'] .' '. $marc['d1']);
$atomic['author'][] = $temp;
$tags[] = 'scrib:author='. trim($temp);
}else if($lineray[0] == 110){
$marc = self::parse_marc_row($lineray);
$temp = $marc['a1'];
$atomic['author'][] = $temp;
$tags[] = 'scrib:author='. trim($temp);
}else if(($lineray[0] > 699) && ($lineray[0] < 721)){
$marc = self::parse_marc_row($lineray);
$temp = ereg_replace(',$', '', $marc['a1'] .' '. $marc['d1']);
$atomic['author'][] = $temp;
$tags[] = 'scrib:author='. trim($temp);
//Standard Numbers
}else if($lineray[0] == 10){
$marc = self::parse_marc_row($lineray);
$atomic['lccn'][] = ereg_replace('[^0-9]', '', $marc['a1']);
$tags[] = 'scrib:lccn='. trim($marc['a1']);
}else if($lineray[0] == 20){
$marc = self::parse_marc_row($lineray);
$temp = ereg_replace('[^0-9|x|X]', '', $marc['a1']);
$atomic['isbn'][] = strtolower($temp);
//Titles
}else if($lineray[0] == 245){
$marc = self::parse_marc_row($lineray);
$temp = ucwords(trim(ereg_replace('/$', '', $marc['a1']) .' '. trim(ereg_replace('/$', '', $marc['b1']))));
$atomic['title'][] = $temp;
$tags[] = 'scrib:title='. trim($temp);
$atomic['attribution'][] = $marc['c1'];
}else if($lineray[0] == 240){
$marc = self::parse_marc_row($lineray);
$temp = trim(ereg_replace('/$', '', $marc['a1'] .' '. $marc['b1']));
$atomic['alttitle'][] = $temp;
$tags[] = 'scrib:title='. trim($temp);
}else if($lineray[0] == 246){
$marc = self::parse_marc_row($lineray);
$temp = trim(ereg_replace('/$', '', $marc['a1'] .' '. $marc['b1']));
$atomic['alttitle'][] = $temp;
$tags[] = 'scrib:title='. trim($temp);
}else if(($lineray[0] > 719) && ($lineray[0] < 741)){
$marc = self::parse_marc_row($lineray);
$temp = $marc['a1'];
$atomic['alttitle'][] = $marc['a1'];
$tags[] = 'scrib:title='. trim($temp);
//Dates
}else if($lineray[0] == 260){
$marc = self::parse_marc_row($lineray);
$temp = str_pad(substr(ereg_replace('[^0-9]', '', $marc['c1']), 0, 4), 4 , '5');
$atomic['pubyear'][] = $temp;
}else if($lineray[0] == 5){
$atomic['catdate'][] = $line{7}.$line{8}.$line{9}.$line{10} .'-'. $line{11}.$line{12} .'-'. $line{13}.$line{14};
}else if($lineray[0] == 8){
$atomic['pubyear'][] = substr($line, 14, 4);
//Subjects
}else if(($lineray[0] > 599) && ($lineray[0] < 700)){
$marc = self::parse_marc_row($lineray);
$atomic['subject'][] = implode(' -- ', $marc);
if($atomic['subjkey']){
$atomic['subjkey'] = array_unique(array_merge($atomic['subjkey'], array_values($marc)));
}else{
$atomic['subjkey'] = array_values($marc);
}
//URLs
}else if($lineray[0] == 856){
$marc = self::parse_marc_row($lineray);
unset($temp);
$temp['href'] = $temp['title'] = str_replace(' ', '', $marc['u1']);
if($marc['31'])
$temp['title'] = $marc['31'];
if($marc['z1'])
$temp['title'] = $marc['z1'];
$atomic['url'][] = ''. $temp['title'] .'';
//Notes
}else if(($lineray[0] > 299) && ($lineray[0] < 400)){
$marc = self::parse_marc_row($lineray);
$atomic['physdesc'][] = implode(' ', array_values($marc));
}else if(($lineray[0] > 399) && ($lineray[0] < 500)){
$marc = self::parse_marc_row($lineray);
$atomic['title'][] = implode("\n", array_values($marc));
}else if(($lineray[0] > 799) && ($lineray[0] < 841)){
$marc = self::parse_marc_row($lineray);
$atomic['series'][] = implode("\n", array_values($marc));
}else if(($lineray[0] > 499) && ($lineray[0] < 600)){
$line = substr($line, 7);
if($lineray[0] == 504)
continue;
if($lineray[0] == 505){
$atomic['contents'][] = str_replace(array('> ','> ','> '), '>', '$line"; } //Format if((!$atomic['format']) && ($lineray[0] > 239) && ($lineray[0] < 246)){ $marc = self::parse_marc_row($lineray); $temp = ucwords(strtolower(str_replace('[', '', str_replace(']', '', $marc['h1'])))); if(eregi('^book', $temp)){ $format = 'Book'; $formats = 'Books'; }else if(eregi('^micr', $temp)){ $format = 'Microform'; }else if(eregi('^electr', $temp)){ $format = 'Website'; $formats = 'Websites'; }else if(eregi('^vid', $temp)){ $format = 'Video'; }else if(eregi('^motion', $temp)){ $format = 'Video'; }else if(eregi('^audi', $temp)){ $format = 'Audio'; }else if(eregi('^cass', $temp)){ $format = 'Audio'; }else if(eregi('^phono', $temp)){ $format = 'Audio'; }else if(eregi('^record', $temp)){ $format = 'Audio'; }else if(eregi('^sound', $temp)){ $format = 'Audio'; }else if(eregi('^carto', $temp)){ $format = 'Map'; $formats = 'Maps'; }else if(eregi('^map', $temp)){ $format = 'Map'; $formats = 'Maps'; }else if(eregi('^globe', $temp)){ $format = 'Map'; $formats = 'Maps'; }else if($temp){ $format = 'Classroom Material'; //$format = $temp; } if(!$formats) $formats = $format; if($format){ $atomic['format'][] = $format; $tags[] = 'scrib:format='. trim($formats); } } // print_r($lineray); } if(!$atomic['format'][0]){ $atomic['format'][0] = 'Book'; $tags[] = 'scrib:format=Books'; } if(!$atomic['acqdate']) $atomic['acqdate'] = $atomic['catdate']; if(!$atomic['catdate'][0]) $atomic['catdate'][0] = '1984-01-01'; if($atomic['pubyear'][0] > (date(Y) + 5)) $atomic['pubyear'][0] = substr($atomic['catdate'][0],0,4); if($atomic['pubyear'][0]){ $atomic['pubdate'] = $atomic['pubyear'][0].substr($atomic['catdate'][0],4); $tags[] = 'scrib:pubyear='. $atomic['pubyear'][0]; } foreach($atomic['isbn'] as $temp){ $tags[] = 'scrib:isbn='. 
trim($temp); } if($atomic['alttitle']) $atomic['title'] = array_unique(array_merge($atomic['title'], $atomic['alttitle'])); foreach($atomic['subjkey'] as $temp){ $tags[] = 'scrib:subject='. trim($temp); } $atomic['tags'] = $tags; // print_r($atomic); return($atomic); } public function scrapeit($host, $bibn){ global $scrib, $wpdb, $scrib_importer, $scrib_templates; // if(!$scrib_importer->post_exists(substr(ereg_replace('[^a-z|0-9]', '', strtolower($_REQUEST['sourceid'])), 0, 2) . $bibn)){ $iiirecord = $this->parse_marc($this->fetchmarc($this->marcurl($host, $bibn))); if($iiirecord['title']){ if($enrichinfo = $scrib_importer->enrich($iiirecord['isbn'])){ //print_r($enrichinfo); $iiirecord['img'] = $enrichinfo['img']; if($enrichinfo['summary']){ echo 's'; $iiirecord['shortdescription'] = $enrichinfo['summary']; } } $iiirecord['the_title'] = $iiirecord['title'][0]; $iiirecord['the_pubdate'] = $iiirecord['pubdate'][0]; $iiirecord['the_acqdate'] = $iiirecord['acqdate'][0]; $iiirecord['the_sourceid'] = substr(ereg_replace('[^a-z|0-9]', '', strtolower($_REQUEST['sourceid'])), 0, 2) . $bibn; $iiirecord['tags'][] = 'scrib:sourceid='. $iiirecord['the_sourceid']; $iiirecord['the_excerpt'] = $scrib_templates->summary($iiirecord); $iiirecord['the_content'] = $scrib_templates->full($iiirecord); //print_r($iiirecord); // $scrib_importer->insertpost($iiirecord); flush(); // if(($iiirecord['img']['thumb']['url'] && ($bibn < 1200000)) || (!$scrib_importer->post_exists(substr(ereg_replace('[^a-z|0-9]', '', strtolower($_REQUEST['sourceid'])), 0, 2) . $bibn))){ // $scrib_importer->insertpost($iiirecord); $scrib_importer->insertharvest($iiirecord); flush(); // }else{ // echo "
ignored $bibn, already in catalog.
"; // } } } } if($_REQUEST['sourceid'] && $_REQUEST['host'] && $_REQUEST['bibn']){ $scrib_III_scraper = new Scrib_III_scraper; $scrib_importer = new Scrib_importer; $scrib_templates = new Scrib_default_templates; if($_REQUEST['to'] && $_REQUEST['bibn'] <= $_REQUEST['to']){ for ($i = $_REQUEST['bibn']; $i < $_REQUEST['bibn'] + 25; $i++) { $scrib_III_scraper->scrapeit($_REQUEST['host'], $i); } $nextbibn = $i; echo ' '; }else{ define('SCRIB_DEBUG', true); $scrib_III_scraper->scrapeit($_REQUEST['host'], $_REQUEST['bibn']); } $prefs['lasthost'] = $_REQUEST['host']; $prefs['lastbibn'] = $_REQUEST['bibn']; $prefs['lastsourceid'] = $_REQUEST['sourceid']; update_option('scrib_IIIimporter', $prefs); }else{ $prefs = get_option('scrib_IIIimporter'); ?>