http://'. $host .'/search/.b'. $bibn .'/.b'. $bibn .'/1%2C1%2C1%2CB/marc~b'. $bibn); // echo ('
http://'. $host .'/record=b'. $bibn); if(!strpos(file_get_contents('http://'. $host .'/record=b'. $bibn), 'No matches found')){ return('http://'. $host .'/search/.b'. $bibn .'/.b'. $bibn .'/1%2C1%2C1%2CB/marc~b'. $bibn); }else{ echo '

no such record, or record is suppressed from public view

'; } }

	/**
	 * Download a page and return the text captured by the record pattern.
	 *
	 * The page is fetched with file_get_contents(), passed through
	 * mb_convert_encoding(..., 'UTF-8') and then matched against the pattern
	 * below; the first capture group (everything up to the next '<') is
	 * returned. No accent transliteration is performed.
	 *
	 * @param string $targeturl URL (or path) to fetch; an empty value is tolerated.
	 * @return string The captured text, or '' when the URL is empty, the fetch
	 *                fails, or the pattern does not match.
	 */
	public function fetchmarc($targeturl){
		// An empty target would make file_get_contents() warn (or throw on PHP 8).
		if(!$targeturl)
			return('');

		$page = file_get_contents($targeturl);
		if($page === false)
			return('');

		// NOTE(review): no source encoding is given, so the mbstring internal
		// encoding is assumed for the input -- confirm this matches the OPAC.
		$record = mb_convert_encoding($page, 'UTF-8');

		// NOTE(review): the pattern as it appears here starts with a bare line
		// break; it looks like markup may have been lost from this literal --
		// confirm against the deployed copy before relying on it.
		if(!preg_match('/
([^<]*)/', $record, $stuff))
			return('');

		return($stuff[1]);
	}

	/**
	 * Turn one exploded MARC line into a map of subfield values.
	 *
	 * Element 0 of $lineray (the tag) is discarded. For every remaining
	 * element the first character is taken as the subfield code and the rest
	 * as its value; the key is the code followed by a per-code occurrence
	 * counter, so the first "a" becomes "a1", the second "a2", and so on.
	 * Values have double spaces collapsed once and are trimmed.
	 *
	 * @param array $lineray Exploded line: tag first, then code-prefixed subfields.
	 * @return array Map of "<code><n>" => value, in input order. Empty
	 *               elements are skipped.
	 */
	private function parse_marc_row($lineray){
		$marcrow = array();
		$count = array();

		unset($lineray[0]);
		foreach($lineray as $element){
			$element = (string) $element;

			// An empty element has no subfield code; reading offset 0 would
			// warn and create an entry keyed only by the counter.
			if($element === '')
				continue;

			$code = $element[0];
			$count[$code] = isset($count[$code]) ? $count[$code] + 1 : 1;

			$marcrow[$code . $count[$code]] = trim(str_replace('  ', ' ', substr($element, 1)));
		}

		return($marcrow);
	}


	/**
	 * Return a subfield value from a parse_marc_row() map, or '' when absent.
	 *
	 * @param array  $marc Map produced by parse_marc_row().
	 * @param string $key  Key such as 'a1' or 'z1'.
	 * @return string
	 */
	private static function marc_subfield($marc, $key){
		return(isset($marc[$key]) ? (string) $marc[$key] : '');
	}

	/**
	 * Map the cleaned-up text of subfield h to a format label.
	 *
	 * @param string $gmd Subfield h with brackets removed.
	 * @return array|null array(singular, plural) or null when $gmd is empty.
	 */
	private static function marc_format($gmd){
		$known = array(
			'/^book/i'                              => array('Book', 'Books'),
			'/^micr/i'                              => array('Microform', 'Microform'),
			'/^electr/i'                            => array('Website', 'Websites'),
			'/^(?:vid|motion)/i'                    => array('Video', 'Video'),
			'/^(?:audi|cass|phono|record|sound)/i'  => array('Audio', 'Audio'),
			'/^(?:carto|map|globe)/i'               => array('Map', 'Maps'),
		);
		foreach($known as $pattern => $labels){
			if(preg_match($pattern, $gmd))
				return($labels);
		}

		// Anything else that is non-empty falls into the catch-all bucket.
		if($gmd)
			return(array('Classroom Material', 'Classroom Material'));

		return(null);
	}

	/**
	 * Convert the text of a MARC display into the importer's record array.
	 *
	 * The text is split into lines (a line break followed by seven spaces is
	 * first folded into a single space), the first and last lines are
	 * dropped, and each remaining line is read as: tag in columns 0-2, two
	 * characters in columns 4-5, field data from column 7, with '|'
	 * separating subfields. Fields are then sorted into author, lccn, isbn,
	 * title, alttitle, pubyear, catdate, subject, url, physdesc, series,
	 * contents, notes and format entries, and a parallel list of
	 * 'scrib:...' tags is built.
	 *
	 * @param string $marcrecord Text as returned by fetchmarc().
	 * @return array Record array; always contains 'format', 'catdate',
	 *               'acqdate' and 'tags'.
	 */
	public function parse_marc($marcrecord){
		$atomic = array();
		$tags = array();

		$marcrecord = str_replace("\n       ", ' ', (string) $marcrecord);

		$details = explode("\n", $marcrecord);
		array_pop($details);
		array_shift($details);

		// Give the leader line a numeric tag so it is laid out like the rest.
		if(isset($details[0]))
			$details[0] = str_replace('LEADER ', '000    ', $details[0]);

		foreach($details as $line){
			$line = trim($line);
			$rawrow = substr($line, 0, 3) . '|' . substr($line, 4, 2) . '|a' . substr($line, 7);

			$atomic['rawrecordtype'] = 'iiimarc';
			if(!isset($atomic['rawrecord']))
				$atomic['rawrecord'] = '';
			$atomic['rawrecord'] .= $rawrow . "\n";

			$lineray = explode('|', preg_replace('/\.$/D', '', $rawrow));
			unset($lineray[1]); // the two characters from columns 4-5 are not used below

			// Compare tags as integers; a non-numeric tag becomes 0 and matches nothing.
			$tag = (int) $lineray[0];

			// Authors (110 has always been handled together with 100 here)
			if(($tag == 100) || ($tag == 110) || (($tag > 699) && ($tag < 721))){
				$marc = self::parse_marc_row($lineray);
				$temp = preg_replace('/,$/D', '', self::marc_subfield($marc, 'a1') .' '. self::marc_subfield($marc, 'd1'));
				$atomic['author'][] = $temp;
				$tags[] = 'scrib:author='. trim($temp);

			// Standard Numbers
			}else if($tag == 10){
				$marc = self::parse_marc_row($lineray);
				$atomic['lccn'][] = preg_replace('/[^0-9]/', '', self::marc_subfield($marc, 'a1'));
				$tags[] = 'scrib:lccn='. trim(self::marc_subfield($marc, 'a1'));
			}else if($tag == 20){
				$marc = self::parse_marc_row($lineray);
				// '|' cannot occur in a subfield value (it is the separator), so it is not listed.
				$temp = preg_replace('/[^0-9xX]/', '', self::marc_subfield($marc, 'a1'));
				$atomic['isbn'][] = strtolower($temp);

			// Titles
			}else if($tag == 245){
				$marc = self::parse_marc_row($lineray);
				$temp = ucwords(trim(
					preg_replace('#/$#D', '', self::marc_subfield($marc, 'a1'))
					.' '.
					trim(preg_replace('#/$#D', '', self::marc_subfield($marc, 'b1')))
				));
				$atomic['title'][] = $temp;
				$tags[] = 'scrib:title='. trim($temp);
				$atomic['attribution'][] = self::marc_subfield($marc, 'c1');
			}else if(($tag == 240) || ($tag == 246)){
				$marc = self::parse_marc_row($lineray);
				$temp = trim(preg_replace('#/$#D', '', self::marc_subfield($marc, 'a1') .' '. self::marc_subfield($marc, 'b1')));
				$atomic['alttitle'][] = $temp;
				$tags[] = 'scrib:title='. trim($temp);
			}else if(($tag > 719) && ($tag < 741)){
				// 720 itself is already taken by the author range above.
				$marc = self::parse_marc_row($lineray);
				$temp = self::marc_subfield($marc, 'a1');
				$atomic['alttitle'][] = $temp;
				$tags[] = 'scrib:title='. trim($temp);

			// Dates
			}else if($tag == 260){
				$marc = self::parse_marc_row($lineray);
				// First four digits of subfield c, right-padded with '5' when shorter.
				$atomic['pubyear'][] = str_pad(substr(preg_replace('/[^0-9]/', '', self::marc_subfield($marc, 'c1')), 0, 4), 4, '5');
			}else if($tag == 5){
				// Columns 7-14 are re-punctuated as yyyy-mm-dd.
				$atomic['catdate'][] = substr($line, 7, 4) .'-'. substr($line, 11, 2) .'-'. substr($line, 13, 2);
			}else if($tag == 8){
				$atomic['pubyear'][] = substr($line, 14, 4);

			// Subjects
			}else if(($tag > 599) && ($tag < 700)){
				$marc = self::parse_marc_row($lineray);
				$atomic['subject'][] = implode(' -- ', $marc);
				if(!empty($atomic['subjkey'])){
					$atomic['subjkey'] = array_unique(array_merge($atomic['subjkey'], array_values($marc)));
				}else{
					$atomic['subjkey'] = array_values($marc);
				}

			// URLs
			}else if($tag == 856){
				$marc = self::parse_marc_row($lineray);
				$link = array();
				$link['href'] = $link['title'] = str_replace(' ', '', self::marc_subfield($marc, 'u1'));
				if(self::marc_subfield($marc, '31'))
					$link['title'] = self::marc_subfield($marc, '31');
				if(self::marc_subfield($marc, 'z1'))
					$link['title'] = self::marc_subfield($marc, 'z1');
				// NOTE(review): the empty literals around the title, and the
				// otherwise unused 'href', suggest link markup was lost from
				// this statement -- confirm against the deployed copy.
				$atomic['url'][] = ''. $link['title'] .'';

			// Notes
			}else if(($tag > 299) && ($tag < 400)){
				$marc = self::parse_marc_row($lineray);
				$atomic['physdesc'][] = implode(' ', array_values($marc));
			}else if(($tag > 399) && ($tag < 500)){
				// NOTE(review): 4xx fields are appended to 'title'; confirm this is intended.
				$marc = self::parse_marc_row($lineray);
				$atomic['title'][] = implode("\n", array_values($marc));
			}else if(($tag > 799) && ($tag < 841)){
				$marc = self::parse_marc_row($lineray);
				$atomic['series'][] = implode("\n", array_values($marc));
			}else if(($tag > 499) && ($tag < 600)){
				$note = substr($line, 7);
				if($tag == 504)
					continue;
				if($tag == 505){
					// NOTE(review): the bullet/whitespace literals and the
					// identical search strings below are reproduced exactly as
					// found; they look like list markup and multi-space
					// strings that were altered -- confirm before changing.
					$atomic['contents'][] = str_replace(
						array('> ','> ','> '),
						'>',
						"\n  • ". str_replace('--', "\n  • \n\n  • ", trim(str_replace(array(' ', ' ', ' '), ' ', $note))) ."\n  • "
					);
					continue;
				}
				$atomic['notes'][] = str_replace(' ', ' ', $note);
			}

			// Format: taken from subfield h of the first 240-245 field that has one.
			if(empty($atomic['format']) && ($tag > 239) && ($tag < 246)){
				$marc = self::parse_marc_row($lineray);
				$gmd = ucwords(strtolower(str_replace('[', '', str_replace(']', '', self::marc_subfield($marc, 'h1')))));
				$labels = self::marc_format($gmd);
				if($labels){
					$atomic['format'][] = $labels[0];
					$tags[] = 'scrib:format='. trim($labels[1]);
				}
			}
		}

		if(empty($atomic['format'][0])){
			$atomic['format'][0] = 'Book';
			$tags[] = 'scrib:format=Books';
		}

		// NOTE(review): acqdate is copied before catdate receives its default,
		// so it stays null when the record has no 005 line -- confirm intent.
		if(empty($atomic['acqdate']))
			$atomic['acqdate'] = isset($atomic['catdate']) ? $atomic['catdate'] : null;
		if(empty($atomic['catdate'][0]))
			$atomic['catdate'][0] = '1984-01-01';

		// A publication year more than five years ahead is replaced by the catalog year.
		if(isset($atomic['pubyear'][0]) && ((int) $atomic['pubyear'][0] > ((int) date('Y') + 5)))
			$atomic['pubyear'][0] = substr($atomic['catdate'][0], 0, 4);

		if(!empty($atomic['pubyear'][0])){
			$atomic['pubdate'] = $atomic['pubyear'][0] . substr($atomic['catdate'][0], 4);
			$tags[] = 'scrib:pubyear='. $atomic['pubyear'][0];
		}

		if(!empty($atomic['isbn'])){
			foreach($atomic['isbn'] as $isbn){
				$tags[] = 'scrib:isbn='. trim($isbn);
			}
		}

		if(!empty($atomic['alttitle'])){
			$titles = empty($atomic['title']) ? array() : $atomic['title'];
			$atomic['title'] = array_unique(array_merge($titles, $atomic['alttitle']));
		}

		if(!empty($atomic['subjkey'])){
			foreach($atomic['subjkey'] as $subject){
				$tags[] = 'scrib:subject='. trim($subject);
			}
		}

		$atomic['tags'] = $tags;

		return($atomic);
	}

	/**
	 * Fetch, parse, enrich and harvest a single bib record.
	 *
	 * Relies on the global $scrib_importer (enrich(), insertharvest()) and
	 * $scrib_templates (summary(), full()) objects and reads
	 * $_REQUEST['sourceid'] to build the record's source id. Records without
	 * a title are ignored.
	 *
	 * @param string     $host OPAC host name, without scheme.
	 * @param int|string $bibn Bib record number.
	 * @return void
	 */
	public function scrapeit($host, $bibn){
		global $scrib, $wpdb, $scrib_importer, $scrib_templates;

		// marcurl() prints a message and returns nothing for missing or
		// suppressed records; there is nothing to fetch in that case.
		$url = $this->marcurl($host, $bibn);
		if(!$url)
			return;

		$iiirecord = $this->parse_marc($this->fetchmarc($url));
		if(empty($iiirecord['title']))
			return;

		$enrichinfo = $scrib_importer->enrich(isset($iiirecord['isbn']) ? $iiirecord['isbn'] : null);
		if($enrichinfo){
			$iiirecord['img'] = $enrichinfo['img'];
			if($enrichinfo['summary']){
				echo 's';
				$iiirecord['shortdescription'] = $enrichinfo['summary'];
			}
		}

		$iiirecord['the_title'] = $iiirecord['title'][0];

		// parse_marc() stores 'pubdate' as a string, so indexing it with [0]
		// would yield only its first character.
		$pubdate = isset($iiirecord['pubdate']) ? $iiirecord['pubdate'] : null;
		$iiirecord['the_pubdate'] = is_array($pubdate) ? $pubdate[0] : $pubdate;

		$iiirecord['the_acqdate'] = isset($iiirecord['acqdate'][0]) ? $iiirecord['acqdate'][0] : null;

		// Two-character source prefix: lower-case a-z and 0-9 only.
		$sourceid = isset($_REQUEST['sourceid']) ? (string) $_REQUEST['sourceid'] : '';
		$iiirecord['the_sourceid'] = substr(preg_replace('/[^a-z0-9]/', '', strtolower($sourceid)), 0, 2) . $bibn;
		$iiirecord['tags'][] = 'scrib:sourceid='. $iiirecord['the_sourceid'];

		$iiirecord['the_excerpt'] = $scrib_templates->summary($iiirecord);
		$iiirecord['the_content'] = $scrib_templates->full($iiirecord);

		$scrib_importer->insertharvest($iiirecord);
		flush();
	}
}

// Run the importer when the form was submitted; otherwise load the saved
// preferences and fall through to the form below.
if(!empty($_REQUEST['sourceid']) && !empty($_REQUEST['host']) && !empty($_REQUEST['bibn'])){
	$scrib_III_scraper = new Scrib_III_scraper;
	$scrib_importer = new Scrib_importer;
	$scrib_templates = new Scrib_default_templates;

	if(!empty($_REQUEST['to']) && $_REQUEST['bibn'] <= $_REQUEST['to']){
		// Range mode: process a batch of 25 records starting at 'bibn'.
		for($i = $_REQUEST['bibn']; $i < $_REQUEST['bibn'] + 25; $i++){
			$scrib_III_scraper->scrapeit($_REQUEST['host'], $i);
		}
		$nextbibn = $i;
		// NOTE(review): only a single space is echoed here and $nextbibn is
		// otherwise unused; markup that continued with the next batch may
		// have been lost -- confirm against the deployed copy.
		echo ' ';
	}else{
		define('SCRIB_DEBUG', true);
		$scrib_III_scraper->scrapeit($_REQUEST['host'], $_REQUEST['bibn']);
	}

	$prefs['lasthost'] = $_REQUEST['host'];
	$prefs['lastbibn'] = $_REQUEST['bibn'];
	$prefs['lastsourceid'] = $_REQUEST['sourceid'];
	update_option('scrib_IIIimporter', $prefs);
}else{
	$prefs = get_option('scrib_IIIimporter'); ?>

    example: lola.plymouth.edu (no http://)

    example: 1069003 (the first (or only) record to process)

    example: 1070000 (the last record)

    example: bb (must be two characters, a-z and 0-9 accepted)

    scrape records directly from your III web OPAC.