([^<]*)/', $record, $stuff); $record = $stuff[1]; return($record); }

	/**
	 * Fetch one subfield from a row produced by parse_marc_row().
	 *
	 * @param array  $marc row keyed by subfield code plus occurrence number (e.g. 'a1')
	 * @param string $key  key to look up
	 * @return string the stored value, or '' when the subfield is absent
	 */
	private static function marc_subfield($marc, $key){
		return isset($marc[$key]) ? $marc[$key] : '';
	}

	/**
	 * Turn the exploded pieces of one MARC line into a keyed row.
	 *
	 * Element 0 (the tag) is dropped. For every remaining piece the first
	 * character is the subfield code and the rest is its value; the key is the
	 * code followed by a per-code counter starting at 1 ('a1', 'a2', 'b1', ...).
	 *
	 * @param array $lineray pieces of a line split on '|'
	 * @return array subfield key => trimmed value
	 */
	private function parse_marc_row($lineray){
		$marcrow = array();
		$count = array();

		unset($lineray[0]);
		foreach($lineray as $element){
			$element = (string) $element;

			// a piece with no characters has no subfield code to key it by
			if($element === '')
				continue;

			$code = $element[0];
			$count[$code] = isset($count[$code]) ? $count[$code] + 1 : 1;

			// NOTE(review): search and replacement are identical as they appear here;
			// the search string may have been a longer run of spaces — confirm upstream
			$marcrow[$code . $count[$code]] = trim(str_replace(' ', ' ', (string) substr($element, 1)));
		}

		return($marcrow);
	}

	/**
	 * Parse the text of a III MARC display into an array of record fields.
	 *
	 * The first and last lines of the input are discarded, continuation lines
	 * are folded into their parent, and each remaining line is dispatched on
	 * its three character tag. Always sets 'rawrecordtype', 'rawrecord',
	 * 'format', 'catdate' and 'tags'; the other keys ('author', 'title',
	 * 'isbn', 'subject', ...) are only present when a matching line was seen.
	 *
	 * @param string $marcrecord MARC display text, one field per line
	 * @return array the parsed record
	 */
	public function parse_marc($marcrecord){
		$atomic = array();
		$tags = array();

		$marcrecord = str_replace("\n ", ' ', (string) $marcrecord);
		$details = explode("\n", $marcrecord);
		array_pop($details);
		array_shift($details);
		$details[0] = str_replace('LEADER ', '000 ', isset($details[0]) ? $details[0] : '');

		$atomic['rawrecordtype'] = 'iiimarc';
		$atomic['rawrecord'] = '';

		// prefix of the 245$h style medium text => array(format, format tag)
		$known_formats = array(
			'book'   => array('Book', 'Books'),
			'micr'   => array('Microform', 'Microform'),
			'electr' => array('Website', 'Websites'),
			'vid'    => array('Video', 'Video'),
			'motion' => array('Video', 'Video'),
			'audi'   => array('Audio', 'Audio'),
			'cass'   => array('Audio', 'Audio'),
			'phono'  => array('Audio', 'Audio'),
			'record' => array('Audio', 'Audio'),
			'sound'  => array('Audio', 'Audio'),
			'carto'  => array('Map', 'Maps'),
			'map'    => array('Map', 'Maps'),
			'globe'  => array('Map', 'Maps'),
		);

		foreach($details as $line){
			$line = trim($line);

			// tag | indicators | a<data>: the leading data is treated as subfield a
			$rawline = substr($line, 0, 3) . '|' . substr($line, 4, 2) . '|a' . substr($line, 7);
			$atomic['rawrecord'] .= $rawline . "\n";

			$lineray = explode('|', preg_replace('/\.$/', '', $rawline));
			unset($lineray[1]);

			// compare tags as integers so the range checks do not depend on
			// PHP's string/number comparison rules
			$tag = (int) $lineray[0];

			// Authors
			// (a second, separate branch for tag 110 could never run because
			// this one already matches it, so it has been removed)
			if(($tag == 100) || ($tag == 110) || (($tag > 699) && ($tag < 721))){
				$marc = self::parse_marc_row($lineray);
				// trim first: with no d subfield the joined string ends in a
				// space and the trailing comma would not be at the end
				$author = preg_replace('/,$/', '', trim(self::marc_subfield($marc, 'a1') . ' ' . self::marc_subfield($marc, 'd1')));
				$atomic['author'][] = $author;
				$tags[] = 'scrib:author=' . trim($author);

			// Standard Numbers
			}else if($tag == 10){
				$marc = self::parse_marc_row($lineray);
				$lccn = self::marc_subfield($marc, 'a1');
				$atomic['lccn'][] = preg_replace('/[^0-9]/', '', $lccn);
				$tags[] = 'scrib:lccn=' . trim($lccn);

			}else if($tag == 20){
				$marc = self::parse_marc_row($lineray);
				// values never contain '|' (the line was split on it), so the
				// class only needs digits and the check character
				$atomic['isbn'][] = strtolower(preg_replace('/[^0-9xX]/', '', self::marc_subfield($marc, 'a1')));

			// Titles
			}else if($tag == 245){
				$marc = self::parse_marc_row($lineray);
				$title = ucwords(trim(
					preg_replace('#/$#', '', self::marc_subfield($marc, 'a1'))
					. ' ' .
					trim(preg_replace('#/$#', '', self::marc_subfield($marc, 'b1')))
				));
				$atomic['title'][] = $title;
				$tags[] = 'scrib:title=' . trim($title);
				$atomic['attribution'][] = self::marc_subfield($marc, 'c1');

			}else if($tag == 240){
				$marc = self::parse_marc_row($lineray);
				// trim before stripping so a trailing slash is found even when
				// there is no b subfield
				$alttitle = trim(preg_replace('#/$#', '', trim(self::marc_subfield($marc, 'a1') . ' ' . self::marc_subfield($marc, 'b1'))));
				$atomic['alttitle'][] = $alttitle;
				$tags[] = 'scrib:title=' . trim($alttitle);

			}else if(($tag > 719) && ($tag < 741)){
				$marc = self::parse_marc_row($lineray);
				$alttitle = self::marc_subfield($marc, 'a1');
				$atomic['alttitle'][] = $alttitle;
				$tags[] = 'scrib:title=' . trim($alttitle);

			// Dates
			}else if($tag == 260){
				$marc = self::parse_marc_row($lineray);
				// first four digits of subfield c, right-padded with '5'
				$digits = preg_replace('/[^0-9]/', '', self::marc_subfield($marc, 'c1'));
				$atomic['pubyear'][] = str_pad((string) substr($digits, 0, 4), 4, '5');

			}else if($tag == 5){
				// characters 7-14 of the line, rewritten as 4-2-2 with dashes
				$atomic['catdate'][] = substr($line, 7, 4) . '-' . substr($line, 11, 2) . '-' . substr($line, 13, 2);

			}else if($tag == 8){
				$atomic['pubyear'][] = (string) substr($line, 14, 4);

			// Subjects
			}else if(($tag > 599) && ($tag < 700)){
				$marc = self::parse_marc_row($lineray);
				$atomic['subject'][] = implode(' -- ', $marc);
				if(!empty($atomic['subjkey'])){
					$atomic['subjkey'] = array_unique(array_merge($atomic['subjkey'], array_values($marc)));
				}else{
					$atomic['subjkey'] = array_values($marc);
				}

			// URLs
			}else if($tag == 856){
				$marc = self::parse_marc_row($lineray);
				$link = array();
				$link['href'] = $link['title'] = str_replace(' ', '', self::marc_subfield($marc, 'u1'));
				if(self::marc_subfield($marc, '31'))
					$link['title'] = self::marc_subfield($marc, '31');
				if(self::marc_subfield($marc, 'z1'))
					$link['title'] = self::marc_subfield($marc, 'z1');
				// NOTE(review): the empty literals look like markup that was lost
				// (href is computed but never used) — confirm against upstream
				$atomic['url'][] = '' . $link['title'] . '';

			// Notes
			}else if(($tag > 299) && ($tag < 400)){
				$marc = self::parse_marc_row($lineray);
				$atomic['physdesc'][] = implode(' ', array_values($marc));

			}else if(($tag > 399) && ($tag < 500)){
				$marc = self::parse_marc_row($lineray);
				$atomic['title'][] = implode("\n", array_values($marc));

			}else if(($tag > 799) && ($tag < 841)){
				$marc = self::parse_marc_row($lineray);
				$atomic['series'][] = implode("\n", array_values($marc));

			}else if(($tag > 499) && ($tag < 600)){
				$line = (string) substr($line, 7);

				// 504 lines are skipped entirely
				if($tag == 504)
					continue;

				if($tag == 505){
					// NOTE(review): the bullet/newline literals and the repeated
					// search strings look like list markup and space runs that
					// were flattened — confirm against upstream
					$atomic['contents'][] = str_replace(
						array('> ','> ','> '),
						'>',
						"\n  • " . str_replace('--', "\n  • \n\n  • ", trim(str_replace(array(' ', ' ', ' '), ' ', $line))) . "\n  • "
					);
					continue;
				}

				$atomic['notes'][] = str_replace(' ', ' ', $line);
			}

			// Format: taken from subfield h of the first 240-245 line that yields one
			if(empty($atomic['format']) && ($tag > 239) && ($tag < 246)){
				$marc = self::parse_marc_row($lineray);
				$medium = ucwords(strtolower(str_replace(array('[', ']'), '', self::marc_subfield($marc, 'h1'))));

				$format = '';
				$formats = '';
				foreach($known_formats as $prefix => $labels){
					if(stripos($medium, $prefix) === 0){
						list($format, $formats) = $labels;
						break;
					}
				}

				// any other non-empty medium text
				if(($format === '') && $medium)
					$format = $formats = 'Classroom Material';

				if($format){
					$atomic['format'][] = $format;
					$tags[] = 'scrib:format=' . trim($formats);
				}
			}
		}

		// defaults for records that named no format or catalog date
		if(empty($atomic['format'][0])){
			$atomic['format'][0] = 'Book';
			$tags[] = 'scrib:format=Books';
		}
		if(empty($atomic['catdate'][0]))
			$atomic['catdate'][0] = '1984-01-01';

		// a year more than five years ahead is replaced by the catalog year
		if(isset($atomic['pubyear'][0]) && ((int) $atomic['pubyear'][0] > ((int) date('Y') + 5)))
			$atomic['pubyear'][0] = substr($atomic['catdate'][0], 0, 4);

		if(!empty($atomic['pubyear'][0])){
			// publication year plus the month and day of the catalog date
			$atomic['pubdate'] = $atomic['pubyear'][0] . substr($atomic['catdate'][0], 4);
			$tags[] = 'scrib:pubyear=' . $atomic['pubyear'][0];
		}

		if(!empty($atomic['isbn'])){
			foreach($atomic['isbn'] as $isbn){
				$tags[] = 'scrib:isbn=' . trim($isbn);
			}
		}

		if(!empty($atomic['alttitle'])){
			$titles = isset($atomic['title']) ? $atomic['title'] : array();
			$atomic['title'] = array_unique(array_merge($titles, $atomic['alttitle']));
		}

		if(!empty($atomic['subjkey'])){
			foreach($atomic['subjkey'] as $subject){
				$tags[] = 'scrib:subject=' . trim($subject);
			}
		}

		$atomic['tags'] = $tags;

		return($atomic);
	}

	/**
	 * Fetch, parse and harvest a single bib record.
	 *
	 * Does nothing when the parsed record has no title. Reads the global
	 * $scrib, $scrib_importer and $scrib_templates objects and
	 * $_REQUEST['sourceid'].
	 *
	 * @param string $host catalog host, passed to marcurl()
	 * @param mixed  $bibn bib record number
	 */
	public function scrapeit($host, $bibn){
		global $scrib, $scrib_importer, $scrib_templates;

		$iiirecord = $this->parse_marc($this->fetchmarc($this->marcurl($host, $bibn)));
		if(empty($iiirecord['title']))
			return;

		// the lookup result used to be stored under one name and read back
		// under another, so none of these values were ever populated
		$isbns = isset($iiirecord['isbn']) ? $iiirecord['isbn'] : null;
		if(!empty($scrib->options['amazonapi']) && ($amazoninfo = $scrib_importer->enrich($isbns))){
			$iiirecord['asin'] = isset($amazoninfo['asin']) ? $amazoninfo['asin'] : null;
			$iiirecord['img'] = isset($amazoninfo['img']) ? $amazoninfo['img'] : null;
			$iiirecord['amzn'] = $amazoninfo;
			if(!empty($amazoninfo['review'][0]['content']))
				$iiirecord['shortdescription'] = $scrib_importer->summarize($amazoninfo['review'][0]['content']);
		}

		$iiirecord['the_title'] = $iiirecord['title'][0];
		// pubdate is a string built by parse_marc(); indexing it with [0]
		// returned only its first character
		$iiirecord['the_pubdate'] = isset($iiirecord['pubdate']) ? $iiirecord['pubdate'] : '';
		$iiirecord['the_catdate'] = $iiirecord['catdate'][0];

		// source id: first two a-z/0-9 characters of the request value, then the bib number
		$sourceid = isset($_REQUEST['sourceid']) ? $_REQUEST['sourceid'] : '';
		$iiirecord['the_sourceid'] = substr(preg_replace('/[^a-z0-9]/', '', strtolower($sourceid)), 0, 2) . $bibn;
		$iiirecord['tags'][] = 'scrib:sourceid=' . $iiirecord['the_sourceid'];

		$iiirecord['the_excerpt'] = $scrib_templates->summary($iiirecord);
		$iiirecord['the_content'] = $scrib_templates->full($iiirecord);

		flush();
		$scrib_importer->insertharvest($iiirecord);
		flush();
	}
}

// Driver: runs when sourceid, host and bibn were all supplied with the request.
if($_REQUEST['sourceid'] && $_REQUEST['host'] && $_REQUEST['bibn']){
	$scrib_III_scraper = new Scrib_III_scraper;
	$scrib_importer = new Scrib_importer;
	$scrib_templates = new Scrib_default_templates;

	if($_REQUEST['to'] && $_REQUEST['bibn'] <= $_REQUEST['to']){
		// batch mode: 25 consecutive records starting at bibn
		for ($i = $_REQUEST['bibn']; $i < $_REQUEST['bibn'] + 25; $i++) {
			$scrib_III_scraper->scrapeit($_REQUEST['host'], $i);
		}
		$nextbibn = $i;
		// NOTE(review): $nextbibn is not used in what is visible here and only a
		// space is echoed; output may have been lost — confirm against upstream
		echo ' ';
	}else{
		define('SCRIB_DEBUG', true);
		$scrib_III_scraper->scrapeit($_REQUEST['host'], $_REQUEST['bibn']);
	}

	// remember the request values
	$prefs['lasthost'] = $_REQUEST['host'];
	$prefs['lastbibn'] = $_REQUEST['bibn'];
	$prefs['lastsourceid'] = $_REQUEST['sourceid'];
	update_option('scrib_IIIimporter', $prefs);
}else{
	$prefs = get_option('scrib_IIIimporter');
?>

    example: lola.plymouth.edu (no http://)

    example: 1069003 (the first (or only) record to process)

    example: 1070000 (the last record)

    example: bb (must be two characters, a-z and 0-9 accepted)

    scrape records directly from your III web OPAC.