#!/usr/bin/perl
#
# ImportCitations : attempt to extract bibliographic references from a
# variety of file formats
#
# Author: J.P.Knight@lboro.ac.uk
#
=head1 ImportCitations
A CGI script that takes a document and tries to extract citations from
it.
=cut
use strict;
use CGI;
use Data::Dumper;
use Algorithm::Diff;
use Archive::Any;
use XML::Simple;
use JSON;
use IO::File;
use ZOOM;
$| = 1;
use constant ERR_UNKNOWN_ERROR => 0;
use constant ERR_SUCCESS => 1;
use constant ERR_MISSING_PARAMETER => 2;
use constant ERR_NO_CONVERSION_MAPPING => 3;
binmode STDERR, ":encoding(utf8)";
=head2 Z39.50 parameters - change these for your local set up:
=head3 Z3950Hostname
Returns the host that should be used for Z39.50 queries.
=cut
my $Z3950Hostname = "library.ox.ac.uk";
=head3 Z3950Port
Returns the port that should be used for Z39.50 queries.
=cut
my $Z3950Port = 210;
=head3 Z3950DBName
Returns the database name that should be used for Z39.50 queries.
=cut
my $Z3950DBName = "ALEPH";
=head3 Z3950Username
Returns the username that should be used for Z39.50 queries.
=cut
my $Z3950Username = '';
=head3 Z3950Password
Returns the password that should be used for Z39.50 queries.
=cut
my $Z3950Password = '';
=head3 MaxZ3950Matches
Returns the maximum number of matches to return in one go in Z39.50 queries.
=cut
my $MaxZ3950Matches = 10;
=head3 Z3950ConnectionLifespan
How many Z39.50 queries should be made over a Net::Z3950 connection before
it is closed and new one started? This is sometimes needed because remote
Z39.50 targets fall over if they get lots of different queries on a single
connection.
=cut
my $Z3950ConnectionLifespan = 50;
my $q = new CGI;
my $format = $q->param('format') || 'RIS';
my $callback = $q->param('callback') || 0;
my $debug = $q->param('debug') || 0;
if(!$q->param('file')) {
SearchForm();
} else {
my $returns = {};
my $code = ERR_UNKNOWN_ERROR; # Default to something odd!
($code,$returns) =
ImportCitations(
'file_handle' => $q->upload('file') || '',
'debug' => $debug,
);
if($format eq 'JSON') {
OutputJSON(
code => $code,
returns => $returns,
callback => $callback,
);
} else {
# Default to outputing in RIS format
OutputRIS(
code => $code,
returns => $returns,
);
}
}
exit;
###########################################################################
# Subroutines for the actual operation
sub ImportCitations {
my $params;
%{$params} = @_;
my $return_code = 0;
my $results = {};
my $fh = $params->{'file_handle'};
my $debug = $params->{'debug'} || 0;
($return_code,$results) =
DocxProcess(
'file_handle' => $params->{'file_handle'},
'debug' => $debug,
);
$results->{'import_format'} = 'Docx';
return($return_code,$results);
}
sub DocxProcess {
my $params;
%{$params} = @_;
my $return_code = ERR_UNKNOWN_ERROR;
my $results = {};
my $fh = $params->{'file_handle'};
my $debug = $params->{'debug'} || 0;
if(!$fh) {
$return_code = ERR_MISSING_PARAMETER;
return($return_code, $results);
}
my $file = "/tmp/$$.docx";
open(TMP, ">$file");
while (<$fh>) {
print TMP $_;
}
close TMP;
my $archive = Archive::Any->new($file, 'ZIP');
my $tmpdir = "/tmp/extract.$$";
mkdir $tmpdir;
$archive->extract($tmpdir);
# create object
my $xml = new XML::Simple;
# read XML file
my $data = $xml->XMLin("$tmpdir/word/document.xml");
if(ref($data->{'w:body'}->{'w:p'}) ne 'ARRAY') {
# Empty or malformed document - ignore the whole thing
$return_code = ERR_NO_CONVERSION_MAPPING;
return($return_code,$results);
}
my @paragraphs = @{$data->{'w:body'}->{'w:p'}};
my @works;
my $lastline = '';
open(DMP, ">>/tmp/dump.$$");
binmode DMP, ":encoding(utf8)";
foreach my $parag (@paragraphs) {
print DMP Dumper($parag);
my $line = '';
my $has_hyperlink = 0;
if(ref($parag->{'w:r'}) ne 'ARRAY') {
if($parag->{'w:hyperlink'}->{'w:r'}->{'w:t'}) {
my $thisurl = $parag->{'w:hyperlink'}->{'w:r'}->{'w:t'};
$has_hyperlink = 1;
if($thisurl !~ /^\w+:\/\//) {
$thisurl = 'http://'.$thisurl;
}
$line = $lastline . ' '. $thisurl;
} else {
if(ref($parag->{'w:r'}->{'w:t'}) eq 'HASH') {
if($parag->{'w:r'}->{'w:t'}->{'content'}) {
$line .= $parag->{'w:r'}->{'w:t'}->{'content'};
}
if($parag->{'w:r'}->{'w:t'}->{'xml:space'} &&
$parag->{'w:r'}->{'w:t'}->{'xml:space'} eq 'preserve') {
$line .= ' ';
}
} else {
$line = $parag->{'w:r'}->{'w:t'};
}
}
} else {
my @linebits = @{$parag->{'w:r'}};
my $italics = 0;
my @url;
foreach my $bit (@linebits) {
if($bit->{'w:rPr'}->{'w:i'}) {
if($italics == 0) {
$line .= '';
}
$italics = 1;
} elsif($italics == 1) {
$line .= '';
$italics = 0;
}
if(ref $bit->{'w:t'} eq 'HASH') {
if($bit->{'w:t'}->{'content'}) {
$line .= $bit->{'w:t'}->{'content'};
}
if($bit->{'w:t'}->{'xml:space'} && $bit->{'w:t'}->{'xml:space'} eq 'preserve') {
$line .= ' ';
}
} else {
$line .= $bit->{'w:t'};
}
}
# Close off duff open italics.
if($italics == 1) {
$line .= '';
$italics = 0;
}
}
if(!$line || $line =~ /^\s*$/) {
next;
}
$lastline = $line;
if($debug) {
print DMP "Looking at line of $line\n";
}
my $guessed = 0;
if($line =~ /^\s*(.+?)\s*\(*(\d\d\d\d)\s*?\[*\d{0,4}\]*\)*(.+)$/) {
# Try to work out a potential book, chapter, etc using Z39.50
my $poss_authors = $1;
my $date = $2;
my $poss_title = $3;
my $num;
$num++ while $poss_authors =~ /\S+/g;
if($num > 15) {
# Lots of words in the author? Sounds like some random
# date in some text to me - make it a public note.
my $this_work = {
'Type' => 'Note',
'details' => {
'notetext' => $line,
}
};
push @works, $this_work;
next;
}
my $this_work;
($guessed, $this_work) = Z3950Guess(poss_authors => $poss_authors,
date => $date,
poss_title => $poss_title,
line => $line,
);
if($guessed) {
push @works, $this_work;
}
}
if(!$guessed) {
if(
($line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*(.+?):\s*([\w\s\&]*)\.(.*)$/ ||
$line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*()\s*([\w\s\&]*)\.(.*)$/ ||
$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\\s*(.+)\s*\<\/i\>\s*(.+?):\s*([\w\s\&]*)\.*(.*)$/ ||
$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\\s*(.+)\s*\<\/i\>,*\s*()\s*([\w\s\&]*)\.*(.*)$/ ||
$line =~ /^(.+)\.*\s+\(*(\d\d\d\d)\)*[\.,]\s+(.+)[\.,]\s+([A-Za-z].+):(.+).(.*)$/) &&
WordCount($1) < 10 &&
WordCount($3) < 10
) {
my $authors = $1;
my $date = $2;
my $title = $3;
my $where = $4;
my $publisher = $5;
my $rest = $6;
$title =~ s/^[\s,\.\)]+//;
$title =~ s///g;
$title =~ s/<\/i>//g;
$where =~ s/[^\w\s]//g;
$where =~ s/^\s+//;
$rest =~ s/^\s+//;
# Book with proper publisher location
my $this_work = {
'Type' => 'Book',
'details' => {
'authors' => $authors,
'title' => $title,
'pubdate' => $date,
'pubwhere' => $where,
'publisher' => $publisher,
'other_info' => "$rest (Original citation: $line)",
}
};
push @works, $this_work;
} elsif(
($line =~ /^(.+),\s+(.+),\s+(In)*\s+(.+ \(eds\))\s+(.+).\s+(.+),\s+(\d\d\d\d),\s+(.+).()/i ||
$line =~ /^(.+)\.\s+(.+)\s*(In)*:\s*(.+)(.+)<\/i\>\.*\s*(.*)(\d\d\d\d)[\.\s]*(pp[\.\s]*[\d\s\-]+)(.*)$/i) &&
WordCount($1) < 10 &&
WordCount($2) < 10 &&
WordCount($4) < 10 &&
WordCount($5) < 10
) {
# BOOK CHAPTER
my $partauthors = $1 . '.';
my $parttitle = $2;
my $authors = $4;
my $title = $5;
my $publisher = $6;
my $date = $7;
my $pages = $8;
my $rest = $9;
my $num;
$num++ while $partauthors =~ /\S+/g;
if($num > 15) {
# Lots of words in the author? Sounds like some random
# date in some text to me - make it a public note.
my $this_work = {
'Type' => 'Note',
'details' => {
'notetext' => $line,
}
};
push @works, $this_work;
next;
}
$title =~ s/^[\s,\.\)]+//;
$title =~ s///g;
$title =~ s/<\/i>//g;
$rest =~ s/^\s+//;
my $this_work = {
'Type' => 'Book Chapter',
'details' => {
'authors' => $authors,
'title' => $title,
'part_authors' => $partauthors,
'part_title' => $parttitle,
'pubdate' => $date,
'publisher' => $publisher,
'other_info' => "$rest (Original citation: $line)",
}
};
push @works, $this_work;
} elsif(
$line =~ /^(.+),*\s+\(*(\d\d\d\d)\)*(\s+\(ed\.*\))*,*\s+(.+),\s+(.*[a-z].+),\s+(.+)/ &&
WordCount($1) < 10 &&
WordCount($3) < 10 &&
WordCount($4) < 10
) {
# Book chapter in some weirdo Sociology format that they call
# Harvard, but which isn't like anything else we've seen.
my $partauthors = '';
my $parttitle = '';
my $authors = $1;
my $date = $2;
$authors .= $3;
my $title = $4;
my $publisher = $5;
my $rest = $6;
$title =~ s/^[\s,\.\)]+//;
$title =~ s///g;
$title =~ s/<\/i>//g;
$rest =~ s/^\s+//;
my $this_work = {
'Type' => 'Book Chapter',
'details' => {
'authors' => $authors,
'title' => $title,
'part_authors' => $partauthors,
'part_title' => $parttitle,
'pubdate' => $date,
'publisher' => $publisher,
'other_info' => "$rest (Original citation: $line)",
}
};
push @works, $this_work;
} elsif(
$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
$line =~ /^(.+)\s+\(*\(()\)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
$line =~ /^(.+)\s+\(*()\)*\.*\s*(.+)\s*\\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s+(.+)\s*[\.,]\s+(.+)[\.,]*\s(\d+)\s*\((\d+)\)[\:\.,]\s+(.+)/ ||
$line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\s+(.+)\((.+)\):*\s+(.+)/ ||
$line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,\s+(\d+)\:(\d+),*\s+(.+)/ ||
$line =~ /^(.+)\s+\((\d{4})\)\s+\'(.+)\'\s+(.+)\s+(\d+)\((\d+)\)\:\s+(.+)/ ||
$line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,*\s+(\d+)\:(\d*),*\s+(.+)/
) {
# Journal article
my $authors = $1;
my $date = $2 || 'No date';
my $title = $3;
my $journal = $4;
my $volume = $5;
my $issue = $6;
my $pages = $7;
my $this_work = {
'Type' => 'Journal Article',
'details' => {
'authors' => $authors,
'title' => $title,
'pubdate' => $date,
'journal' => $journal,
'volume' => $volume,
'issue' => $issue,
'pages' => $pages,
}
};
push @works, $this_work;
} elsif(
($line =~ /^(.*)\s*(http:\/\/.+)(\".*)$/) ||
($line =~ /^(.*)\s*(https:\/\/.+)(\".*)$/) ||
($line =~ /^(.*)\s*(ftp:\/\/.+)(\".*)$/) ||
($line =~ /^(.*)\s*(http:\/\/.+)\s*(.*)$/) ||
($line =~ /^(.*)\s*(https:\/\/.+)\s*(.*)$/) ||
($line =~ /^(.*)\s*(ftp:\/\/.+)\s*(.*)$/) ||
($line =~ /^(.*)\s+(www.[^\s]+)(.*)$/)
) {
# An electronic resource
my $prelude = $1;
my $url = $2;
my $rest = $3;
my $this_work = {
'Type' => 'Electronic Resource',
'details' => {
'prelude' => $prelude,
'url' => $url,
'rest' => $rest,
}
};
push @works, $this_work;
} elsif($line !~ /^\s*Note/i &&
(scalar(@_ = split(/\s+/,$line)) < 7)) {
my $this_work = {
'Type' => 'Sub-Heading',
'details' => {
'subheading' => "$line",
}
};
push @works, $this_work;
} else {
if($debug) {
print DMP "Unmatched LINE: $line\n";
warn "Unmatched LINE: $line\n";
}
my $this_work = {
'Type' => 'Note',
'details' => {
'notetext' => $line,
}
};
push @works, $this_work;
}
}
}
close DMP;
$return_code = ERR_SUCCESS;
@{$results->{'works'}} = @works;
return($return_code,$results);
}
sub Z3950Guess {
my $params;
%$params = @_;
my $date = $params->{'date'} || return 0;
my $poss_authors = $params->{'poss_authors'} || '';
my $poss_title = $params->{'poss_title'} || '';
my $line = $params->{'line'} || '';
my $depth = $params->{'depth'} || 0;
my $guessed = 0;
my $conn; # Z39.50 connection
my $zconnlife = $Z3950ConnectionLifespan;
my $conncount = -1;
my $sutype = 'Book';
my $quotes = 0;
my $seenstop = 0;
my $seencomma = 0;
my $processed_title = '';
my $parttitle = '';
my $partauthors = '';
my $original_date = '';
my $original_publisher = '';
# Some things have weird, broken dates with things in brackets. Lets
# remove them from the title.
if($poss_title =~ /^\s*(\d{4})\]\s+(.+)/) {
$original_date = $1;
$poss_title = $2;
}
# Try to cut out publisher from the title
if($poss_title =~ /(.+?)\.{0,1}\s+([\w\s]+\:\s+[\w\s]+)\.{0,1}\s*$/ ||
$poss_title =~ /^(.+?)\s+([A-Za-z\-]+ [A-Z][A-Z])\:/ ||
$poss_title =~ /^(.+?)\s+([A-Za-z\-]+, [A-Z][A-Z])\:/ ||
$poss_title =~ /^(.+?)\s+([A-Za-z\-])+\:/ ||
$poss_title =~ /^(.+?)\s(University of .+ Press)/ ||
$poss_title =~ /^(.+?)\s([A-Za-z\-]+ University Press)/
) {
$poss_title = $1;
$original_publisher = $2;
if($original_publisher =~ /^York/ &&
$poss_title =~ /New$/) {
$original_publisher = 'New ' . $original_publisher;
$poss_title =~ s/New$//;
}
if($debug) {
warn "Hopefully lopped publisher from title: $poss_title\n";
}
}
$poss_title =~ s/^\s*[\,\.]\s+//;
$poss_title =~ s/^\s+//;
# Try to remove DDC class marks from "titles" for completely bogus
# referencing styles
my $poss_classmark = '';
if($poss_title =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) {
$poss_classmark = $1;
if($debug) {
warn "Z39.50 Class mark found: $poss_classmark\n";
}
$poss_title =~ s/(\d{3}\.\d+\/?[A-Z]{0,3}).*//g;
} elsif($line =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) {
$poss_classmark = $1;
if($debug) {
warn "Z39.50 Class mark found: $poss_classmark\n";
}
}
# Try to spot titles that are in weird Unicode quote marks.
if($poss_title =~ /\xE2\x80\x98/) {
if($debug) {
warn "Has crappy Unicode in title.\n";
}
}
$poss_authors =~ s/\s&\s/ and /g;
# Try to spot book chapters that have sneaked in as titles
if($poss_title =~ /^(.+) in (.+) \(eds\)\s+(.+)/) {
$partauthors = $poss_authors;
$parttitle = $1;
$poss_authors = $2;
$poss_title = $3;
if($debug) {
warn "Looks like this is a book chapter from '$poss_title' edited by $poss_authors\n";
}
$sutype = "Book Chapter";
}
if($debug) {
warn "Z39.50 Guessing with $poss_authors * $date * $poss_title\n";
}
my @titwords = split(/ /, $poss_title);
while(my $titword = shift @titwords) {
if($debug) {
warn "titword = $titword\n" if($poss_authors =~ /Webster/);
}
if($titword =~ /^[\'\"]/) {
$quotes++;
$processed_title .= ' ' . $titword;
next;
}
if($titword =~ /[\'\"]$/) {
$quotes--;
$processed_title .= ' ' . $titword;
next;
}
if($titword =~ /[\'\"]\.$/) {
$quotes--;
$seenstop = 1;
$processed_title .= ' ' . $titword;
next;
}
if(!$quotes && $titword =~ /\.$/) {
$seenstop = 1;
$processed_title .= ' ' . $titword;
next;
}
if($titword =~ /[\'\"]\,$/) {
$quotes--;
$seencomma = 1;
$processed_title .= ' ' . $titword;
next;
}
if(!$quotes && $titword =~ /\,$/) {
$seencomma = 1;
if($debug) {
warn "We've seen a comma!\n";
}
$processed_title .= ' ' . $titword;
next;
}
if($seenstop && lc($titword) eq 'in') {
$partauthors = $poss_authors;
$parttitle = $processed_title;
$sutype = 'Book Chapter';
my $rest = join(' ', @titwords);
$rest =~ /(.+),\s+(.+)/;
$poss_authors = $1;
$poss_title = $2;
$poss_title =~ s/[^\w\s]//g;
last;
}
if($seenstop) {
$poss_title = $processed_title;
$poss_title =~ s/[^\w\s]//g;
last;
}
if($seencomma && $titword =~ /\:/) {
$poss_title = $processed_title;
if($debug) {
warn "Seen a colon - setting poss_title to $poss_title\n";
}
last;
}
$processed_title .= ' ' . $titword;
}
$poss_title =~ s/-/ /g;
$poss_title =~ s/\N{U+2018}/\'/g;
$poss_title =~ s/\N{U+2019}/\'/g;
$poss_title =~ s/\N{U+201c}/\"/g;
$poss_title =~ s/\N{U+201d}/\"/g;
$poss_title =~ s/[^\w\s\']//g;
my $longestword = "";
my $nextlongestword = "";
foreach my $titword (split(/[\s,.;:\(\)]/, $poss_title)) {
next if(lc($titword) eq 'or');
next if(lc($titword) eq 'and');
next if(lc($titword) eq 'not');
next if($titword =~ /\'/);
next if($titword =~ /^\d+$/);
if(length($titword) >= length($longestword)) {
$nextlongestword = $longestword;
$longestword = lc($titword);
} elsif(length($titword) > length($nextlongestword)) {
$nextlongestword = lc($titword);
}
}
$nextlongestword = $longestword if($nextlongestword eq "");
if($conncount > $zconnlife || $conncount == -1 || !defined($conn)) {
$conn =
new ZOOM::Connection($Z3950Hostname,
$Z3950Port,
databaseName => $Z3950DBName,
user => $Z3950Username,
password => $Z3950Password,
);
$conn->option(preferredRecordSyntax => 'usmarc');
$conncount = 0;
}
my $termcount = 0;
my $this_zsearch = '';
my $zsearch_count = 0;
my @zsearch;
my $zconj = '';
if($poss_authors) {
my $longestauthor = '';
foreach my $authword (split(/[\s,.;:\(\)]/, $poss_authors)) {
next if($authword =~ /^and$/i);
next if($authword =~ /^\&$/i);
if(length($authword) >= length($longestauthor)) {
$longestauthor = lc($authword);
}
}
$this_zsearch .= '@attr 1=1003 '.$longestauthor. ' ';
$termcount++;
}
if($longestword) {
if($longestword =~ /s$/) {
$this_zsearch .= "\@or \@attr 1=4 $longestword ";
$longestword =~ s/s$//;
$this_zsearch .= "\@attr 1=4 $longestword ";
} else {
$this_zsearch .= "\@attr 1=4 $longestword ";
}
$termcount++;
$zconj .= '@and ';
}
if($nextlongestword) {
if($nextlongestword =~ /s$/) {
$this_zsearch .= " \@or \@attr 1=4 $nextlongestword ";
$nextlongestword =~ s/s$//;
$this_zsearch .= "\@attr 1=4 $nextlongestword ";
} else {
$this_zsearch .= "\@attr 1=4 $nextlongestword ";
}
$termcount++;
$zconj .= '@and ';
}
if($this_zsearch ne '') {
my $no_date_zsearch = $zconj . $this_zsearch;
$this_zsearch = '@and ' . $zconj . '@attr 1=31 ' .
$date . ' ' . $this_zsearch;
$zsearch[$zsearch_count++] =
{
'type' => 'primary',
'search' => $this_zsearch,
};
if($debug) {
warn "Zsearch = $this_zsearch\n";
}
# One year on
$zsearch[$zsearch_count]->{'type'} = 'yearplus1';
$zsearch[$zsearch_count]->{'search'} = $this_zsearch;
my $date1 = $date + 1;
$zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date1/g;
# One year before
$zsearch[$zsearch_count]->{'type'} = 'yearminus1';
$zsearch[$zsearch_count]->{'search'} = $this_zsearch;
my $date2 = $date - 1;
$zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date2/g;
# Ignore years completely
$zsearch[$zsearch_count++] = {
'type' => 'nopubdate',
'search' => $no_date_zsearch,
};
}
if($poss_classmark) {
$zsearch[$zsearch_count++] = {
'type' => 'classmark',
'search' => "\@attr 1=1016 $poss_classmark",
};
}
return(0, {}) if($zsearch_count == 0);
my $foundauthors = "";
my $foundtitle = "";
my $foundimprint = "";
my $foundcallno = "";
my $foundedition = "";
my $foundpubdate = "";
my $foundisbn = "";
my $authorjoin = "";
my $havefound = 0;
foreach my $this (@zsearch) {
my $rs;
my $thissearch = $this->{'search'};
next if($thissearch eq '');
if($debug) {
warn "Doing thissearch $thissearch\n";
}
eval($rs = $conn->search_pqf($thissearch));
while(!defined($rs)) {
eval($rs = $conn->search_pqf($thissearch));
}
$rs->option(elementSetName => "");
if((defined($rs)) && $rs && (my $n = $rs->size() > 0)) {
my $searchsize = $rs->size();
if($searchsize > $MaxZ3950Matches) {
my $maxsize = $MaxZ3950Matches;
$searchsize = $maxsize;
}
foreach my $i (1 .. $searchsize) {
my $rec = $rs->record($i-1);
my $data = $rec->render();
foreach my $line (split(/\n/,$data)) {
if($debug) {
warn "LINE: $line\n";
}
if(
($line =~ /^020\s*\$a\s*([0-9X\-]+)/i) ||
($line =~ /^022\s*\$a\s*([0-9X\-]+)/i) ||
($line =~ /^001\s*([a-z0-9X\-]+)/i) ||
($line =~ /^001\s*(-[a-z0-9X\-]+-*)/i)
) {
$foundisbn = $1;
if($foundisbn =~ /^-/) {
$foundisbn =~ s/^-//;
$foundisbn =~ s/-+$/--/;
}
}
if($line =~ /^100.*\$a([^\$]+)/) {
my $thisauth = $1;
$thisauth =~ s/,\s+$//;
$foundauthors .= $authorjoin.$thisauth;
$authorjoin= ", ";
}
if($line =~ /^110.*\$a([^\$]+)/) {
$foundauthors .= $authorjoin.$1;
$authorjoin= ", ";
if($line =~ /\$b([^\$]+)/) {
$foundauthors .= ": ".$1;
}
}
if($line =~ /^111.*\$a([^\$]+)/) {
$foundauthors .= $authorjoin.$1;
$authorjoin= ", ";
}
if($line =~ /^245.*\$a([^\$]+)/) {
$foundtitle = $1;
$foundtitle =~ s/\/\s*$//;
if($line =~ /\$b([^\$]+)/) {
$foundtitle .= " ".$1;
}
}
if($line =~ /^250\s*\$a([^\$]+)/) {
$foundedition = $1;
}
if($line =~ /^260.*\$c\s*c*(\d\d\d\d)/ ||
$line =~ /^264.*\$c\s*c*(\d\d\d\d)/
) {
$foundpubdate = $1;
if($line =~ /\$b([^\$]+)/) {
$foundimprint = $1;
}
}
if($line =~ /^700.*\$a([^\$]+)/) {
my $thisauth = $1;
$thisauth =~ s/,\s+$//;
$foundauthors .= $authorjoin.$thisauth;
$authorjoin= ", ";
}
}
if($debug) {
warn "Comparing $foundtitle with $longestword and $nextlongestword\n";
warn "And date $foundpubdate against $date\n";
}
if($this->{'type'} eq 'primary' &&
$foundtitle =~ /$longestword/i &&
(!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
(!$foundpubdate || $date == $foundpubdate)) {
$havefound = 1;
last;
} elsif($this->{'type'} eq 'yearminus1' &&
$foundtitle =~ /$longestword/i &&
(!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
(!$foundpubdate || ($date - 1) == $foundpubdate)) {
$havefound = 1;
last;
} elsif($this->{'type'} eq 'yearplus1' &&
$foundtitle =~ /$longestword/i &&
(!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
(!$foundpubdate || ($date + 1) == $foundpubdate)) {
$havefound = 1;
last;
} elsif($this->{'type'} eq 'nopubdate' &&
$foundtitle =~ /$longestword/i &&
(!$nextlongestword || $foundtitle =~ /$nextlongestword/i)) {
$havefound = 1;
last;
} elsif($this->{'type'} eq 'classmark') {
if($debug) {
warn "Classmark match from $searchsize results\n";
}
# $havefound = 1;
# last;
} else {
$foundauthors = "";
$foundtitle = "";
$foundimprint = "";
$foundcallno = "";
$foundedition = "";
$foundpubdate = "";
$foundisbn = "";
$authorjoin = "";
}
}
} else {
if($debug) {
warn "No matches for $thissearch\n";
}
}
last if($havefound);
}
$foundauthors =~ s/^\s+//;
$foundauthors =~ s/\s+$//;
$foundtitle =~ s/^\s+//;
$foundtitle =~ s/\s+$//;
$foundtitle =~ s/\/$//;
$conncount++;
if($debug) {
warn "Foundauthors: $foundauthors\n";
warn "Foundtitle: $foundtitle\n";
warn "Foundpubdate: $foundpubdate\n";
}
my $this_work;
if(($foundauthors || $foundtitle)) {
$this_work = {
'Type' => $sutype,
'details' => {
'authors' => $foundauthors,
'title' => $foundtitle,
'pubdate' => $date,
'publisher' => $foundimprint,
'isbn' => $foundisbn,
'other_info' => "Original citation: $line",
}
};
if($foundedition) {
$this_work->{'detail'}->{'edition'} = $foundedition;
}
if($sutype eq 'Book Chapter') {
if($partauthors) {
$this_work->{'detail'}->{'part_authors'} =
$partauthors;
if($debug) {
warn "Setting part authors to " .
$this_work->{'detail'}->{'part_authors'} . "\n";
}
}
if($parttitle) {
$this_work->{'detail'}->{'part_title'} = $parttitle;
if($debug) {
warn "Setting part title to " .
$this_work->{'detail'}->{'part_title'} . "\n";
}
}
}
$guessed = 1;
if($debug) {
warn Dumper($this_work) . "\n";
}
} else {
if($depth == 0 && $poss_title =~ /ize/) {
$poss_title =~ s/ize/ise/g;
($guessed, $this_work) = Z3950Guess(poss_title => $poss_title,
poss_authors => $poss_authors,
date => $date,
line => $line,
depth => $depth+1,
);
}
if(!$guessed && $depth == 0 && $poss_title =~ /ise/) {
$poss_title=~ s/ise/ize/g;
($guessed, $this_work) = Z3950Guess(poss_title => $poss_title,
poss_authors => $poss_authors,
date => $date,
line => $line,
depth => $depth+1,
);
}
}
return($guessed, $this_work);
}
sub WordCount {
my $text = shift;
if($debug) {
warn "Counting words in $text\n";
}
my @text_words = split(/\s+/, $text);
my $count = scalar(@text_words);
if($debug) {
warn "Found $count words\n";
}
return $count;
}
sub OutputRIS {
my $params;
%$params = @_;
my $returns = $params->{'returns'} || {};
my $code = $params->{'code'} || 0;
print "Content-type: “application/x-research-info-systems;charset=utf-8\n\n";
print "Provider: ImportCitations\r\n";
print "Content: RIS\r\n";
if(ref $returns->{'works'} eq 'ARRAY') {
foreach my $work (@{$returns->{'works'}}) {
if($work->{'Type'} eq 'Book') {
print "TY - BOOK\r\n";
} elsif($work->{'Type'} eq 'Book Chapter') {
print "TY - CHAP\r\n";
} elsif($work->{'Type'} eq 'Journal') {
print "TY - JFULL\r\n";
} elsif($work->{'Type'} eq 'Journal Article') {
print "TY - JOUR\r\n";
} elsif($work->{'Type'} eq 'Electronic Resource') {
print "TY - ELEC\r\n";
}
my $details = $work->{'details'};
if($details->{'authors'}) {
print "AU - " . $details->{'authors'} . "\r\n";
}
if($details->{'title'}) {
print "TI - ". $details->{'title'} . "\r\n";
}
if($details->{'journal'}) {
print "T2 - ". $details->{'journal'} . "\r\n";
}
if($details->{'issue'}) {
print "IS - ". $details->{'issue'} . "\r\n";
}
if($details->{'volume'}) {
print "VL - ". $details->{'volume'} . "\r\n";
}
if($details->{'edition'}) {
print "ET - ". $details->{'edition'} . "\r\n";
}
if($details->{'publisher'}) {
print "PB - ". $details->{'publisher'} . "\r\n";
}
if($details->{'pubwhere'}) {
print "CY - ". $details->{'pubwhere'} . "\r\n";
}
if($details->{'isbn'}) {
print "SN - ". $details->{'isbn'} . "\n";
}
if($details->{'issn'}) {
print "SN - ". $details->{'issn'} . "\n";
}
if($details->{'other_info'}) {
print "OP - ". $details->{'other_info'} . "\n";
}
if($details->{'pages'}) {
# Have to hack pages into start/end pages.
if($details->{'pages'} =~ /(\d+)\-(\d+)/) {
print "SP - $1\r\nEP - $2\r\n";
} elsif($details->{'pages'} =~ /(\d+)/) {
print "SP - $1\r\n";
} else {
print "SP - " . $details->{'pages'} . "\r\n";
}
}
if($details->{'pubdate'}) {
if($details->{'pubdate'} =~ /(\d{4})/) {
my $pubyear = $1;
print "PY - ".sprintf("%04d\n", $pubyear). "\n";
}
}
if($details->{'url'}) {
print "UR - ". $details->{'url'} . "\n";
}
}
}
}
sub OutputJSON {
my $params;
%$params = @_;
my $returns = $params->{'returns'} || {};
my $code = $params->{'code'} || 0;
my $callback = $params->{'callback'} || '';
my $json = JSON->new->allow_nonref;
if($code == ERR_SUCCESS) {
if($callback) {
print "Content-type: application/javascript;charset=utf-8\n\n";
my ($f) = $callback =~ m/([A-Za-z_][0-9A-Za-z_]*)/;
print "$f(".$json->encode($returns).");";
} else {
print "Content-type: application/json\n\n";
print $json->pretty->encode($returns);
}
}
}
sub SearchForm {
print $q->header();
print $q->start_html();
print $q->h1('Citation Importer');
print $q->p('Please supply a file containing citations');
print $q->start_form();
print $q->filefield(
-name => 'file',
-size=>50,
-maxlength=>80,
);
print $q->p('Output format: '.
$q->popup_menu(-name => 'format',
-values => ['RIS', 'JSON'],
-default => 'RIS',
)
);
print $q->p('If JSON output, specify a call back for JSONP (if desired):',
$q->textfield(-name => 'callback'));
print $q->submit();
print $q->end_form();
print $q->end_html();
}