#!/usr/bin/perl # # ImportCitations : attempt to extract bibliographic references from a # variety of file formats # # Author: J.P.Knight@lboro.ac.uk # =head1 ImportCitations A CGI script that takes a document and tries to extract citations from it. =cut use strict; use CGI; use Data::Dumper; use Algorithm::Diff; use Archive::Any; use XML::Simple; use JSON; use IO::File; use ZOOM; $| = 1; use constant ERR_UNKNOWN_ERROR => 0; use constant ERR_SUCCESS => 1; use constant ERR_MISSING_PARAMETER => 2; use constant ERR_NO_CONVERSION_MAPPING => 3; binmode STDERR, ":encoding(utf8)"; =head2 Z39.50 parameters - change these for your local set up: =head3 Z3950Hostname Returns the host that should be used for Z39.50 queries. =cut my $Z3950Hostname = "library.ox.ac.uk"; =head3 Z3950Port Returns the port that should be used for Z39.50 queries. =cut my $Z3950Port = 210; =head3 Z3950DBName Returns the database name that should be used for Z39.50 queries. =cut my $Z3950DBName = "ALEPH"; =head3 Z3950Username Returns the username that should be used for Z39.50 queries. =cut my $Z3950Username = ''; =head3 Z3950Password Returns the password that should be used for Z39.50 queries. =cut my $Z3950Password = ''; =head3 MaxZ3950Matches Returns the maximum number of matches to return in one go in Z39.50 queries. =cut my $MaxZ3950Matches = 10; =head3 Z3950ConnectionLifespan How many Z39.50 queries should be made over a Net::Z3950 connection before it is closed and new one started? This is sometimes needed because remote Z39.50 targets fall over if they get lots of different queries on a single connection. =cut my $Z3950ConnectionLifespan = 50; my $q = new CGI; my $format = $q->param('format') || 'RIS'; my $callback = $q->param('callback') || 0; my $debug = $q->param('debug') || 0; if(!$q->param('file')) { SearchForm(); } else { my $returns = {}; my $code = ERR_UNKNOWN_ERROR; # Default to something odd! ($code,$returns) = ImportCitations( 'file_handle' => $q->upload('file') || '', 'debug' => $debug, ); if($format eq 'JSON') { OutputJSON( code => $code, returns => $returns, callback => $callback, ); } else { # Default to outputing in RIS format OutputRIS( code => $code, returns => $returns, ); } } exit; ########################################################################### # Subroutines for the actual operation sub ImportCitations { my $params; %{$params} = @_; my $return_code = 0; my $results = {}; my $fh = $params->{'file_handle'}; my $debug = $params->{'debug'} || 0; ($return_code,$results) = DocxProcess( 'file_handle' => $params->{'file_handle'}, 'debug' => $debug, ); $results->{'import_format'} = 'Docx'; return($return_code,$results); } sub DocxProcess { my $params; %{$params} = @_; my $return_code = ERR_UNKNOWN_ERROR; my $results = {}; my $fh = $params->{'file_handle'}; my $debug = $params->{'debug'} || 0; if(!$fh) { $return_code = ERR_MISSING_PARAMETER; return($return_code, $results); } my $file = "/tmp/$$.docx"; open(TMP, ">$file"); while (<$fh>) { print TMP $_; } close TMP; my $archive = Archive::Any->new($file, 'ZIP'); my $tmpdir = "/tmp/extract.$$"; mkdir $tmpdir; $archive->extract($tmpdir); # create object my $xml = new XML::Simple; # read XML file my $data = $xml->XMLin("$tmpdir/word/document.xml"); if(ref($data->{'w:body'}->{'w:p'}) ne 'ARRAY') { # Empty or malformed document - ignore the whole thing $return_code = ERR_NO_CONVERSION_MAPPING; return($return_code,$results); } my @paragraphs = @{$data->{'w:body'}->{'w:p'}}; my @works; my $lastline = ''; open(DMP, ">>/tmp/dump.$$"); binmode DMP, ":encoding(utf8)"; foreach my $parag (@paragraphs) { print DMP Dumper($parag); my $line = ''; my $has_hyperlink = 0; if(ref($parag->{'w:r'}) ne 'ARRAY') { if($parag->{'w:hyperlink'}->{'w:r'}->{'w:t'}) { my $thisurl = $parag->{'w:hyperlink'}->{'w:r'}->{'w:t'}; $has_hyperlink = 1; if($thisurl !~ /^\w+:\/\//) { $thisurl = 'http://'.$thisurl; } $line = $lastline . ' '. $thisurl; } else { if(ref($parag->{'w:r'}->{'w:t'}) eq 'HASH') { if($parag->{'w:r'}->{'w:t'}->{'content'}) { $line .= $parag->{'w:r'}->{'w:t'}->{'content'}; } if($parag->{'w:r'}->{'w:t'}->{'xml:space'} && $parag->{'w:r'}->{'w:t'}->{'xml:space'} eq 'preserve') { $line .= ' '; } } else { $line = $parag->{'w:r'}->{'w:t'}; } } } else { my @linebits = @{$parag->{'w:r'}}; my $italics = 0; my @url; foreach my $bit (@linebits) { if($bit->{'w:rPr'}->{'w:i'}) { if($italics == 0) { $line .= ''; } $italics = 1; } elsif($italics == 1) { $line .= ''; $italics = 0; } if(ref $bit->{'w:t'} eq 'HASH') { if($bit->{'w:t'}->{'content'}) { $line .= $bit->{'w:t'}->{'content'}; } if($bit->{'w:t'}->{'xml:space'} && $bit->{'w:t'}->{'xml:space'} eq 'preserve') { $line .= ' '; } } else { $line .= $bit->{'w:t'}; } } # Close off duff open italics. if($italics == 1) { $line .= ''; $italics = 0; } } if(!$line || $line =~ /^\s*$/) { next; } $lastline = $line; if($debug) { print DMP "Looking at line of $line\n"; } my $guessed = 0; if($line =~ /^\s*(.+?)\s*\(*(\d\d\d\d)\s*?\[*\d{0,4}\]*\)*(.+)$/) { # Try to work out a potential book, chapter, etc using Z39.50 my $poss_authors = $1; my $date = $2; my $poss_title = $3; my $num; $num++ while $poss_authors =~ /\S+/g; if($num > 15) { # Lots of words in the author? Sounds like some random # date in some text to me - make it a public note. my $this_work = { 'Type' => 'Note', 'details' => { 'notetext' => $line, } }; push @works, $this_work; next; } my $this_work; ($guessed, $this_work) = Z3950Guess(poss_authors => $poss_authors, date => $date, poss_title => $poss_title, line => $line, ); if($guessed) { push @works, $this_work; } } if(!$guessed) { if( ($line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*(.+?):\s*([\w\s\&]*)\.(.*)$/ || $line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*()\s*([\w\s\&]*)\.(.*)$/ || $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\\s*(.+)\s*\<\/i\>\s*(.+?):\s*([\w\s\&]*)\.*(.*)$/ || $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\\s*(.+)\s*\<\/i\>,*\s*()\s*([\w\s\&]*)\.*(.*)$/ || $line =~ /^(.+)\.*\s+\(*(\d\d\d\d)\)*[\.,]\s+(.+)[\.,]\s+([A-Za-z].+):(.+).(.*)$/) && WordCount($1) < 10 && WordCount($3) < 10 ) { my $authors = $1; my $date = $2; my $title = $3; my $where = $4; my $publisher = $5; my $rest = $6; $title =~ s/^[\s,\.\)]+//; $title =~ s///g; $title =~ s/<\/i>//g; $where =~ s/[^\w\s]//g; $where =~ s/^\s+//; $rest =~ s/^\s+//; # Book with proper publisher location my $this_work = { 'Type' => 'Book', 'details' => { 'authors' => $authors, 'title' => $title, 'pubdate' => $date, 'pubwhere' => $where, 'publisher' => $publisher, 'other_info' => "$rest (Original citation: $line)", } }; push @works, $this_work; } elsif( ($line =~ /^(.+),\s+(.+),\s+(In)*\s+(.+ \(eds\))\s+(.+).\s+(.+),\s+(\d\d\d\d),\s+(.+).()/i || $line =~ /^(.+)\.\s+(.+)\s*(In)*:\s*(.+)(.+)<\/i\>\.*\s*(.*)(\d\d\d\d)[\.\s]*(pp[\.\s]*[\d\s\-]+)(.*)$/i) && WordCount($1) < 10 && WordCount($2) < 10 && WordCount($4) < 10 && WordCount($5) < 10 ) { # BOOK CHAPTER my $partauthors = $1 . '.'; my $parttitle = $2; my $authors = $4; my $title = $5; my $publisher = $6; my $date = $7; my $pages = $8; my $rest = $9; my $num; $num++ while $partauthors =~ /\S+/g; if($num > 15) { # Lots of words in the author? Sounds like some random # date in some text to me - make it a public note. my $this_work = { 'Type' => 'Note', 'details' => { 'notetext' => $line, } }; push @works, $this_work; next; } $title =~ s/^[\s,\.\)]+//; $title =~ s///g; $title =~ s/<\/i>//g; $rest =~ s/^\s+//; my $this_work = { 'Type' => 'Book Chapter', 'details' => { 'authors' => $authors, 'title' => $title, 'part_authors' => $partauthors, 'part_title' => $parttitle, 'pubdate' => $date, 'publisher' => $publisher, 'other_info' => "$rest (Original citation: $line)", } }; push @works, $this_work; } elsif( $line =~ /^(.+),*\s+\(*(\d\d\d\d)\)*(\s+\(ed\.*\))*,*\s+(.+),\s+(.*[a-z].+),\s+(.+)/ && WordCount($1) < 10 && WordCount($3) < 10 && WordCount($4) < 10 ) { # Book chapter in some weirdo Sociology format that they call # Harvard, but which isn't like anything else we've seen. my $partauthors = ''; my $parttitle = ''; my $authors = $1; my $date = $2; $authors .= $3; my $title = $4; my $publisher = $5; my $rest = $6; $title =~ s/^[\s,\.\)]+//; $title =~ s///g; $title =~ s/<\/i>//g; $rest =~ s/^\s+//; my $this_work = { 'Type' => 'Book Chapter', 'details' => { 'authors' => $authors, 'title' => $title, 'part_authors' => $partauthors, 'part_title' => $parttitle, 'pubdate' => $date, 'publisher' => $publisher, 'other_info' => "$rest (Original citation: $line)", } }; push @works, $this_work; } elsif( $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ || $line =~ /^(.+)\s+\(*\(()\)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ || $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ || $line =~ /^(.+)\s+\(*()\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ || $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s+(.+)\s*[\.,]\s+(.+)[\.,]*\s(\d+)\s*\((\d+)\)[\:\.,]\s+(.+)/ || $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\s+(.+)\((.+)\):*\s+(.+)/ || $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,\s+(\d+)\:(\d+),*\s+(.+)/ || $line =~ /^(.+)\s+\((\d{4})\)\s+\'(.+)\'\s+(.+)\s+(\d+)\((\d+)\)\:\s+(.+)/ || $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,*\s+(\d+)\:(\d*),*\s+(.+)/ ) { # Journal article my $authors = $1; my $date = $2 || 'No date'; my $title = $3; my $journal = $4; my $volume = $5; my $issue = $6; my $pages = $7; my $this_work = { 'Type' => 'Journal Article', 'details' => { 'authors' => $authors, 'title' => $title, 'pubdate' => $date, 'journal' => $journal, 'volume' => $volume, 'issue' => $issue, 'pages' => $pages, } }; push @works, $this_work; } elsif( ($line =~ /^(.*)\s*(http:\/\/.+)(\".*)$/) || ($line =~ /^(.*)\s*(https:\/\/.+)(\".*)$/) || ($line =~ /^(.*)\s*(ftp:\/\/.+)(\".*)$/) || ($line =~ /^(.*)\s*(http:\/\/.+)\s*(.*)$/) || ($line =~ /^(.*)\s*(https:\/\/.+)\s*(.*)$/) || ($line =~ /^(.*)\s*(ftp:\/\/.+)\s*(.*)$/) || ($line =~ /^(.*)\s+(www.[^\s]+)(.*)$/) ) { # An electronic resource my $prelude = $1; my $url = $2; my $rest = $3; my $this_work = { 'Type' => 'Electronic Resource', 'details' => { 'prelude' => $prelude, 'url' => $url, 'rest' => $rest, } }; push @works, $this_work; } elsif($line !~ /^\s*Note/i && (scalar(@_ = split(/\s+/,$line)) < 7)) { my $this_work = { 'Type' => 'Sub-Heading', 'details' => { 'subheading' => "$line", } }; push @works, $this_work; } else { if($debug) { print DMP "Unmatched LINE: $line\n"; warn "Unmatched LINE: $line\n"; } my $this_work = { 'Type' => 'Note', 'details' => { 'notetext' => $line, } }; push @works, $this_work; } } } close DMP; $return_code = ERR_SUCCESS; @{$results->{'works'}} = @works; return($return_code,$results); } sub Z3950Guess { my $params; %$params = @_; my $date = $params->{'date'} || return 0; my $poss_authors = $params->{'poss_authors'} || ''; my $poss_title = $params->{'poss_title'} || ''; my $line = $params->{'line'} || ''; my $depth = $params->{'depth'} || 0; my $guessed = 0; my $conn; # Z39.50 connection my $zconnlife = $Z3950ConnectionLifespan; my $conncount = -1; my $sutype = 'Book'; my $quotes = 0; my $seenstop = 0; my $seencomma = 0; my $processed_title = ''; my $parttitle = ''; my $partauthors = ''; my $original_date = ''; my $original_publisher = ''; # Some things have weird, broken dates with things in brackets. Lets # remove them from the title. if($poss_title =~ /^\s*(\d{4})\]\s+(.+)/) { $original_date = $1; $poss_title = $2; } # Try to cut out publisher from the title if($poss_title =~ /(.+?)\.{0,1}\s+([\w\s]+\:\s+[\w\s]+)\.{0,1}\s*$/ || $poss_title =~ /^(.+?)\s+([A-Za-z\-]+ [A-Z][A-Z])\:/ || $poss_title =~ /^(.+?)\s+([A-Za-z\-]+, [A-Z][A-Z])\:/ || $poss_title =~ /^(.+?)\s+([A-Za-z\-])+\:/ || $poss_title =~ /^(.+?)\s(University of .+ Press)/ || $poss_title =~ /^(.+?)\s([A-Za-z\-]+ University Press)/ ) { $poss_title = $1; $original_publisher = $2; if($original_publisher =~ /^York/ && $poss_title =~ /New$/) { $original_publisher = 'New ' . $original_publisher; $poss_title =~ s/New$//; } if($debug) { warn "Hopefully lopped publisher from title: $poss_title\n"; } } $poss_title =~ s/^\s*[\,\.]\s+//; $poss_title =~ s/^\s+//; # Try to remove DDC class marks from "titles" for completely bogus # referencing styles my $poss_classmark = ''; if($poss_title =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) { $poss_classmark = $1; if($debug) { warn "Z39.50 Class mark found: $poss_classmark\n"; } $poss_title =~ s/(\d{3}\.\d+\/?[A-Z]{0,3}).*//g; } elsif($line =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) { $poss_classmark = $1; if($debug) { warn "Z39.50 Class mark found: $poss_classmark\n"; } } # Try to spot titles that are in weird Unicode quote marks. if($poss_title =~ /\xE2\x80\x98/) { if($debug) { warn "Has crappy Unicode in title.\n"; } } $poss_authors =~ s/\s&\s/ and /g; # Try to spot book chapters that have sneaked in as titles if($poss_title =~ /^(.+) in (.+) \(eds\)\s+(.+)/) { $partauthors = $poss_authors; $parttitle = $1; $poss_authors = $2; $poss_title = $3; if($debug) { warn "Looks like this is a book chapter from '$poss_title' edited by $poss_authors\n"; } $sutype = "Book Chapter"; } if($debug) { warn "Z39.50 Guessing with $poss_authors * $date * $poss_title\n"; } my @titwords = split(/ /, $poss_title); while(my $titword = shift @titwords) { if($debug) { warn "titword = $titword\n" if($poss_authors =~ /Webster/); } if($titword =~ /^[\'\"]/) { $quotes++; $processed_title .= ' ' . $titword; next; } if($titword =~ /[\'\"]$/) { $quotes--; $processed_title .= ' ' . $titword; next; } if($titword =~ /[\'\"]\.$/) { $quotes--; $seenstop = 1; $processed_title .= ' ' . $titword; next; } if(!$quotes && $titword =~ /\.$/) { $seenstop = 1; $processed_title .= ' ' . $titword; next; } if($titword =~ /[\'\"]\,$/) { $quotes--; $seencomma = 1; $processed_title .= ' ' . $titword; next; } if(!$quotes && $titword =~ /\,$/) { $seencomma = 1; if($debug) { warn "We've seen a comma!\n"; } $processed_title .= ' ' . $titword; next; } if($seenstop && lc($titword) eq 'in') { $partauthors = $poss_authors; $parttitle = $processed_title; $sutype = 'Book Chapter'; my $rest = join(' ', @titwords); $rest =~ /(.+),\s+(.+)/; $poss_authors = $1; $poss_title = $2; $poss_title =~ s/[^\w\s]//g; last; } if($seenstop) { $poss_title = $processed_title; $poss_title =~ s/[^\w\s]//g; last; } if($seencomma && $titword =~ /\:/) { $poss_title = $processed_title; if($debug) { warn "Seen a colon - setting poss_title to $poss_title\n"; } last; } $processed_title .= ' ' . $titword; } $poss_title =~ s/-/ /g; $poss_title =~ s/\N{U+2018}/\'/g; $poss_title =~ s/\N{U+2019}/\'/g; $poss_title =~ s/\N{U+201c}/\"/g; $poss_title =~ s/\N{U+201d}/\"/g; $poss_title =~ s/[^\w\s\']//g; my $longestword = ""; my $nextlongestword = ""; foreach my $titword (split(/[\s,.;:\(\)]/, $poss_title)) { next if(lc($titword) eq 'or'); next if(lc($titword) eq 'and'); next if(lc($titword) eq 'not'); next if($titword =~ /\'/); next if($titword =~ /^\d+$/); if(length($titword) >= length($longestword)) { $nextlongestword = $longestword; $longestword = lc($titword); } elsif(length($titword) > length($nextlongestword)) { $nextlongestword = lc($titword); } } $nextlongestword = $longestword if($nextlongestword eq ""); if($conncount > $zconnlife || $conncount == -1 || !defined($conn)) { $conn = new ZOOM::Connection($Z3950Hostname, $Z3950Port, databaseName => $Z3950DBName, user => $Z3950Username, password => $Z3950Password, ); $conn->option(preferredRecordSyntax => 'usmarc'); $conncount = 0; } my $termcount = 0; my $this_zsearch = ''; my $zsearch_count = 0; my @zsearch; my $zconj = ''; if($poss_authors) { my $longestauthor = ''; foreach my $authword (split(/[\s,.;:\(\)]/, $poss_authors)) { next if($authword =~ /^and$/i); next if($authword =~ /^\&$/i); if(length($authword) >= length($longestauthor)) { $longestauthor = lc($authword); } } $this_zsearch .= '@attr 1=1003 '.$longestauthor. ' '; $termcount++; } if($longestword) { if($longestword =~ /s$/) { $this_zsearch .= "\@or \@attr 1=4 $longestword "; $longestword =~ s/s$//; $this_zsearch .= "\@attr 1=4 $longestword "; } else { $this_zsearch .= "\@attr 1=4 $longestword "; } $termcount++; $zconj .= '@and '; } if($nextlongestword) { if($nextlongestword =~ /s$/) { $this_zsearch .= " \@or \@attr 1=4 $nextlongestword "; $nextlongestword =~ s/s$//; $this_zsearch .= "\@attr 1=4 $nextlongestword "; } else { $this_zsearch .= "\@attr 1=4 $nextlongestword "; } $termcount++; $zconj .= '@and '; } if($this_zsearch ne '') { my $no_date_zsearch = $zconj . $this_zsearch; $this_zsearch = '@and ' . $zconj . '@attr 1=31 ' . $date . ' ' . $this_zsearch; $zsearch[$zsearch_count++] = { 'type' => 'primary', 'search' => $this_zsearch, }; if($debug) { warn "Zsearch = $this_zsearch\n"; } # One year on $zsearch[$zsearch_count]->{'type'} = 'yearplus1'; $zsearch[$zsearch_count]->{'search'} = $this_zsearch; my $date1 = $date + 1; $zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date1/g; # One year before $zsearch[$zsearch_count]->{'type'} = 'yearminus1'; $zsearch[$zsearch_count]->{'search'} = $this_zsearch; my $date2 = $date - 1; $zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date2/g; # Ignore years completely $zsearch[$zsearch_count++] = { 'type' => 'nopubdate', 'search' => $no_date_zsearch, }; } if($poss_classmark) { $zsearch[$zsearch_count++] = { 'type' => 'classmark', 'search' => "\@attr 1=1016 $poss_classmark", }; } return(0, {}) if($zsearch_count == 0); my $foundauthors = ""; my $foundtitle = ""; my $foundimprint = ""; my $foundcallno = ""; my $foundedition = ""; my $foundpubdate = ""; my $foundisbn = ""; my $authorjoin = ""; my $havefound = 0; foreach my $this (@zsearch) { my $rs; my $thissearch = $this->{'search'}; next if($thissearch eq ''); if($debug) { warn "Doing thissearch $thissearch\n"; } eval($rs = $conn->search_pqf($thissearch)); while(!defined($rs)) { eval($rs = $conn->search_pqf($thissearch)); } $rs->option(elementSetName => ""); if((defined($rs)) && $rs && (my $n = $rs->size() > 0)) { my $searchsize = $rs->size(); if($searchsize > $MaxZ3950Matches) { my $maxsize = $MaxZ3950Matches; $searchsize = $maxsize; } foreach my $i (1 .. $searchsize) { my $rec = $rs->record($i-1); my $data = $rec->render(); foreach my $line (split(/\n/,$data)) { if($debug) { warn "LINE: $line\n"; } if( ($line =~ /^020\s*\$a\s*([0-9X\-]+)/i) || ($line =~ /^022\s*\$a\s*([0-9X\-]+)/i) || ($line =~ /^001\s*([a-z0-9X\-]+)/i) || ($line =~ /^001\s*(-[a-z0-9X\-]+-*)/i) ) { $foundisbn = $1; if($foundisbn =~ /^-/) { $foundisbn =~ s/^-//; $foundisbn =~ s/-+$/--/; } } if($line =~ /^100.*\$a([^\$]+)/) { my $thisauth = $1; $thisauth =~ s/,\s+$//; $foundauthors .= $authorjoin.$thisauth; $authorjoin= ", "; } if($line =~ /^110.*\$a([^\$]+)/) { $foundauthors .= $authorjoin.$1; $authorjoin= ", "; if($line =~ /\$b([^\$]+)/) { $foundauthors .= ": ".$1; } } if($line =~ /^111.*\$a([^\$]+)/) { $foundauthors .= $authorjoin.$1; $authorjoin= ", "; } if($line =~ /^245.*\$a([^\$]+)/) { $foundtitle = $1; $foundtitle =~ s/\/\s*$//; if($line =~ /\$b([^\$]+)/) { $foundtitle .= " ".$1; } } if($line =~ /^250\s*\$a([^\$]+)/) { $foundedition = $1; } if($line =~ /^260.*\$c\s*c*(\d\d\d\d)/ || $line =~ /^264.*\$c\s*c*(\d\d\d\d)/ ) { $foundpubdate = $1; if($line =~ /\$b([^\$]+)/) { $foundimprint = $1; } } if($line =~ /^700.*\$a([^\$]+)/) { my $thisauth = $1; $thisauth =~ s/,\s+$//; $foundauthors .= $authorjoin.$thisauth; $authorjoin= ", "; } } if($debug) { warn "Comparing $foundtitle with $longestword and $nextlongestword\n"; warn "And date $foundpubdate against $date\n"; } if($this->{'type'} eq 'primary' && $foundtitle =~ /$longestword/i && (!$nextlongestword || $foundtitle =~ /$nextlongestword/i) && (!$foundpubdate || $date == $foundpubdate)) { $havefound = 1; last; } elsif($this->{'type'} eq 'yearminus1' && $foundtitle =~ /$longestword/i && (!$nextlongestword || $foundtitle =~ /$nextlongestword/i) && (!$foundpubdate || ($date - 1) == $foundpubdate)) { $havefound = 1; last; } elsif($this->{'type'} eq 'yearplus1' && $foundtitle =~ /$longestword/i && (!$nextlongestword || $foundtitle =~ /$nextlongestword/i) && (!$foundpubdate || ($date + 1) == $foundpubdate)) { $havefound = 1; last; } elsif($this->{'type'} eq 'nopubdate' && $foundtitle =~ /$longestword/i && (!$nextlongestword || $foundtitle =~ /$nextlongestword/i)) { $havefound = 1; last; } elsif($this->{'type'} eq 'classmark') { if($debug) { warn "Classmark match from $searchsize results\n"; } # $havefound = 1; # last; } else { $foundauthors = ""; $foundtitle = ""; $foundimprint = ""; $foundcallno = ""; $foundedition = ""; $foundpubdate = ""; $foundisbn = ""; $authorjoin = ""; } } } else { if($debug) { warn "No matches for $thissearch\n"; } } last if($havefound); } $foundauthors =~ s/^\s+//; $foundauthors =~ s/\s+$//; $foundtitle =~ s/^\s+//; $foundtitle =~ s/\s+$//; $foundtitle =~ s/\/$//; $conncount++; if($debug) { warn "Foundauthors: $foundauthors\n"; warn "Foundtitle: $foundtitle\n"; warn "Foundpubdate: $foundpubdate\n"; } my $this_work; if(($foundauthors || $foundtitle)) { $this_work = { 'Type' => $sutype, 'details' => { 'authors' => $foundauthors, 'title' => $foundtitle, 'pubdate' => $date, 'publisher' => $foundimprint, 'isbn' => $foundisbn, 'other_info' => "Original citation: $line", } }; if($foundedition) { $this_work->{'detail'}->{'edition'} = $foundedition; } if($sutype eq 'Book Chapter') { if($partauthors) { $this_work->{'detail'}->{'part_authors'} = $partauthors; if($debug) { warn "Setting part authors to " . $this_work->{'detail'}->{'part_authors'} . "\n"; } } if($parttitle) { $this_work->{'detail'}->{'part_title'} = $parttitle; if($debug) { warn "Setting part title to " . $this_work->{'detail'}->{'part_title'} . "\n"; } } } $guessed = 1; if($debug) { warn Dumper($this_work) . "\n"; } } else { if($depth == 0 && $poss_title =~ /ize/) { $poss_title =~ s/ize/ise/g; ($guessed, $this_work) = Z3950Guess(poss_title => $poss_title, poss_authors => $poss_authors, date => $date, line => $line, depth => $depth+1, ); } if(!$guessed && $depth == 0 && $poss_title =~ /ise/) { $poss_title=~ s/ise/ize/g; ($guessed, $this_work) = Z3950Guess(poss_title => $poss_title, poss_authors => $poss_authors, date => $date, line => $line, depth => $depth+1, ); } } return($guessed, $this_work); } sub WordCount { my $text = shift; if($debug) { warn "Counting words in $text\n"; } my @text_words = split(/\s+/, $text); my $count = scalar(@text_words); if($debug) { warn "Found $count words\n"; } return $count; } sub OutputRIS { my $params; %$params = @_; my $returns = $params->{'returns'} || {}; my $code = $params->{'code'} || 0; print "Content-type: “application/x-research-info-systems;charset=utf-8\n\n"; print "Provider: ImportCitations\r\n"; print "Content: RIS\r\n"; if(ref $returns->{'works'} eq 'ARRAY') { foreach my $work (@{$returns->{'works'}}) { if($work->{'Type'} eq 'Book') { print "TY - BOOK\r\n"; } elsif($work->{'Type'} eq 'Book Chapter') { print "TY - CHAP\r\n"; } elsif($work->{'Type'} eq 'Journal') { print "TY - JFULL\r\n"; } elsif($work->{'Type'} eq 'Journal Article') { print "TY - JOUR\r\n"; } elsif($work->{'Type'} eq 'Electronic Resource') { print "TY - ELEC\r\n"; } my $details = $work->{'details'}; if($details->{'authors'}) { print "AU - " . $details->{'authors'} . "\r\n"; } if($details->{'title'}) { print "TI - ". $details->{'title'} . "\r\n"; } if($details->{'journal'}) { print "T2 - ". $details->{'journal'} . "\r\n"; } if($details->{'issue'}) { print "IS - ". $details->{'issue'} . "\r\n"; } if($details->{'volume'}) { print "VL - ". $details->{'volume'} . "\r\n"; } if($details->{'edition'}) { print "ET - ". $details->{'edition'} . "\r\n"; } if($details->{'publisher'}) { print "PB - ". $details->{'publisher'} . "\r\n"; } if($details->{'pubwhere'}) { print "CY - ". $details->{'pubwhere'} . "\r\n"; } if($details->{'isbn'}) { print "SN - ". $details->{'isbn'} . "\n"; } if($details->{'issn'}) { print "SN - ". $details->{'issn'} . "\n"; } if($details->{'other_info'}) { print "OP - ". $details->{'other_info'} . "\n"; } if($details->{'pages'}) { # Have to hack pages into start/end pages. if($details->{'pages'} =~ /(\d+)\-(\d+)/) { print "SP - $1\r\nEP - $2\r\n"; } elsif($details->{'pages'} =~ /(\d+)/) { print "SP - $1\r\n"; } else { print "SP - " . $details->{'pages'} . "\r\n"; } } if($details->{'pubdate'}) { if($details->{'pubdate'} =~ /(\d{4})/) { my $pubyear = $1; print "PY - ".sprintf("%04d\n", $pubyear). "\n"; } } if($details->{'url'}) { print "UR - ". $details->{'url'} . "\n"; } } } } sub OutputJSON { my $params; %$params = @_; my $returns = $params->{'returns'} || {}; my $code = $params->{'code'} || 0; my $callback = $params->{'callback'} || ''; my $json = JSON->new->allow_nonref; if($code == ERR_SUCCESS) { if($callback) { print "Content-type: application/javascript;charset=utf-8\n\n"; my ($f) = $callback =~ m/([A-Za-z_][0-9A-Za-z_]*)/; print "$f(".$json->encode($returns).");"; } else { print "Content-type: application/json\n\n"; print $json->pretty->encode($returns); } } } sub SearchForm { print $q->header(); print $q->start_html(); print $q->h1('Citation Importer'); print $q->p('Please supply a file containing citations'); print $q->start_form(); print $q->filefield( -name => 'file', -size=>50, -maxlength=>80, ); print $q->p('Output format: '. $q->popup_menu(-name => 'format', -values => ['RIS', 'JSON'], -default => 'RIS', ) ); print $q->p('If JSON output, specify a call back for JSONP (if desired):', $q->textfield(-name => 'callback')); print $q->submit(); print $q->end_form(); print $q->end_html(); }