#!/usr/bin/perl
#
# ImportCitations : attempt to extract bibliographic references from a
#                   variety of file formats
#
# Author: J.P.Knight@lboro.ac.uk
#

=head1 ImportCitations

A CGI script that takes a document and tries to extract citations from
it.

=cut

use strict;
use CGI;
use Data::Dumper;
use Algorithm::Diff;
use Archive::Any;
use XML::Simple;
use JSON;
use IO::File;
use ZOOM;
$| = 1;

use constant ERR_UNKNOWN_ERROR => 0;
use constant ERR_SUCCESS => 1;
use constant ERR_MISSING_PARAMETER => 2;
use constant ERR_NO_CONVERSION_MAPPING => 3;

binmode STDERR, ":encoding(utf8)";

=head2 Z39.50 parameters - change these for your local set up:

=head3 Z3950Hostname

Returns the host that should be used for Z39.50 queries.

=cut

my $Z3950Hostname = "library.ox.ac.uk";

=head3 Z3950Port

Returns the port that should be used for Z39.50 queries.

=cut

my $Z3950Port = 210;

=head3 Z3950DBName

Returns the database name that should be used for Z39.50 queries.

=cut

my $Z3950DBName = "ALEPH";

=head3 Z3950Username

Returns the username that should be used for Z39.50 queries.

=cut

my $Z3950Username = '';

=head3 Z3950Password

Returns the password that should be used for Z39.50 queries.

=cut

my $Z3950Password = '';

=head3 MaxZ3950Matches

Returns the maximum number of matches to return in one go in Z39.50 queries.

=cut

my $MaxZ3950Matches = 10;

=head3 Z3950ConnectionLifespan

How many Z39.50 queries should be made over a Net::Z3950 connection before 
it is closed and new one started?  This is sometimes needed because remote
Z39.50 targets fall over if they get lots of different queries on a single
connection.

=cut

my $Z3950ConnectionLifespan = 50;

my $q = new CGI;
my $format = $q->param('format') || 'RIS';
my $callback = $q->param('callback') || 0;
my $debug = $q->param('debug') || 0;

if(!$q->param('file')) {
    SearchForm();
} else {
    my $returns = {};
    my $code = ERR_UNKNOWN_ERROR;  # Default to something odd!
    ($code,$returns) = 
	ImportCitations(
	    'file_handle' => $q->upload('file') || '',
	    'debug' => $debug,
	);
    
    if($format eq 'JSON') {
	OutputJSON(
	    code => $code,
	    returns => $returns,
	    callback => $callback,
	    );
    } else {
	# Default to outputing in RIS format
	OutputRIS(
	    code => $code,
	    returns => $returns,
	    );
    }
}
exit;


###########################################################################
# Subroutines for the actual operation

sub ImportCitations {
    my $params;    
    %{$params} = @_;
    my $return_code = 0;
    my $results = {};

    my $fh = $params->{'file_handle'};
    my $debug = $params->{'debug'} || 0;

    ($return_code,$results) = 
	DocxProcess(
	    'file_handle' => $params->{'file_handle'},
	    'debug' => $debug,
	);
    $results->{'import_format'} = 'Docx';

    return($return_code,$results);
}


sub DocxProcess {
    my $params;    
    %{$params} = @_;
    my $return_code = ERR_UNKNOWN_ERROR;
    my $results = {};
    
    my $fh = $params->{'file_handle'};
    my $debug = $params->{'debug'} || 0;
    
    if(!$fh) {
	$return_code = ERR_MISSING_PARAMETER;
	return($return_code, $results);
    }
    
    my $file = "/tmp/$$.docx";
    open(TMP, ">$file");
    while (<$fh>) {
	print TMP $_;
    }
    close TMP;
    
    my $archive = Archive::Any->new($file, 'ZIP');
    my $tmpdir = "/tmp/extract.$$";
    mkdir $tmpdir;
    $archive->extract($tmpdir);

    # create object
    my $xml = new XML::Simple;
    
    # read XML file
    my $data = $xml->XMLin("$tmpdir/word/document.xml");

    if(ref($data->{'w:body'}->{'w:p'}) ne 'ARRAY') {
	# Empty or malformed document - ignore the whole thing
	$return_code = ERR_NO_CONVERSION_MAPPING;
	return($return_code,$results);
    }
    my @paragraphs = @{$data->{'w:body'}->{'w:p'}};
    my @works;
    my $lastline = '';
    open(DMP, ">>/tmp/dump.$$");
    binmode DMP, ":encoding(utf8)";
    foreach my $parag (@paragraphs) {
	print DMP Dumper($parag);
	my $line = '';
	my $has_hyperlink = 0;
	if(ref($parag->{'w:r'}) ne 'ARRAY') {
	    if($parag->{'w:hyperlink'}->{'w:r'}->{'w:t'}) {
		my $thisurl = $parag->{'w:hyperlink'}->{'w:r'}->{'w:t'};
		$has_hyperlink = 1;
		if($thisurl !~ /^\w+:\/\//) {
		    $thisurl = 'http://'.$thisurl;
		}
		$line = $lastline . ' '. $thisurl;
	    } else {
		if(ref($parag->{'w:r'}->{'w:t'}) eq 'HASH') {
		    if($parag->{'w:r'}->{'w:t'}->{'content'}) {
			$line .= $parag->{'w:r'}->{'w:t'}->{'content'};
		    }
		    if($parag->{'w:r'}->{'w:t'}->{'xml:space'} && 
		       $parag->{'w:r'}->{'w:t'}->{'xml:space'} eq 'preserve') {
			$line .= ' ';
		    }
		} else {
		    $line = $parag->{'w:r'}->{'w:t'};
		}
	    }
	} else {
	    my @linebits = @{$parag->{'w:r'}};
	    my $italics = 0;
	    my @url;
	    foreach my $bit (@linebits) {
		if($bit->{'w:rPr'}->{'w:i'}) {
		    if($italics == 0) {
			$line .= '<i>';
		    }
		    $italics = 1;
		} elsif($italics == 1) {
		    $line .= '</i>';
		    $italics = 0;
		}
		if(ref $bit->{'w:t'} eq 'HASH') {
		    if($bit->{'w:t'}->{'content'}) {
			$line .= $bit->{'w:t'}->{'content'};
		    }
		    if($bit->{'w:t'}->{'xml:space'} && $bit->{'w:t'}->{'xml:space'} eq 'preserve') {
			$line .= ' ';
		    }
		} else {
		    $line .= $bit->{'w:t'};
		}
	    }
	    # Close off duff open italics.
	    if($italics == 1) {
		$line .= '</i>';
		$italics = 0;
	    }
	    
	}
	
	if(!$line || $line =~ /^\s*$/) {
	    next;
	}
	$lastline = $line;

	if($debug) {
	    print DMP "Looking at line of $line\n";
	}
	
	my $guessed = 0;
	if($line =~ /^\s*(.+?)\s*\(*(\d\d\d\d)\s*?\[*\d{0,4}\]*\)*(.+)$/) {
	    # Try to work out a potential book, chapter, etc using Z39.50
	    my $poss_authors = $1;
	    my $date = $2;
	    my $poss_title = $3;

	    my $num; 
	    $num++ while $poss_authors =~ /\S+/g;
	    if($num > 15) {
		# Lots of words in the author? Sounds like some random
		# date in some text to me - make it a public note.
		my $this_work = {
		    'Type' => 'Note',
		    'details' => {
			'notetext' => $line,
		    }
		};
		push @works, $this_work;
		next;
	    }
	    my $this_work;
	    ($guessed, $this_work) = Z3950Guess(poss_authors => $poss_authors,
						date => $date,
						poss_title => $poss_title,
						line => $line,
						);
	    if($guessed) {
		push @works, $this_work;
	    }
	}

	if(!$guessed) {
	    if(
	       ($line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*(.+?):\s*([\w\s\&]*)\.(.*)$/ ||
		$line =~ /^(.+?)\.\s*\(*(\d\d\d\d)\)*\.\s*(.+?)\.\s*()\s*([\w\s\&]*)\.(.*)$/ ||
		$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\<i\>\s*(.+)\s*\<\/i\>\s*(.+?):\s*([\w\s\&]*)\.*(.*)$/ ||
		$line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*\<i\>\s*(.+)\s*\<\/i\>,*\s*()\s*([\w\s\&]*)\.*(.*)$/ ||
		$line =~ /^(.+)\.*\s+\(*(\d\d\d\d)\)*[\.,]\s+(.+)[\.,]\s+([A-Za-z].+):(.+).(.*)$/) &&
	       WordCount($1) < 10 &&
	       WordCount($3) < 10
	       ) {
		my $authors = $1;
		my $date = $2;
		my $title = $3;
		my $where = $4;
		my $publisher = $5;
		my $rest = $6;
		
		$title =~ s/^[\s,\.\)]+//;
		$title =~ s/<i>//g;
		$title =~ s/<\/i>//g;
		$where =~ s/[^\w\s]//g;
		$where =~ s/^\s+//;
		$rest =~ s/^\s+//;
		# Book with proper publisher location
		my $this_work = {
		    'Type' => 'Book',
		    'details' => {
			'authors' => $authors,
			'title' => $title,
			'pubdate' => $date,
			'pubwhere' => $where,
			'publisher' => $publisher,
			'other_info' => "$rest (Original citation: $line)",
		    }
		};
		push @works, $this_work;
	    } elsif(
		    ($line =~ /^(.+),\s+(.+),\s+(In)*\s+(.+ \(eds\))\s+(.+).\s+(.+),\s+(\d\d\d\d),\s+(.+).()/i ||
		     $line =~ /^(.+)\.\s+(.+)\s*(In)*:\s*(.+)<i\>(.+)<\/i\>\.*\s*(.*)(\d\d\d\d)[\.\s]*(pp[\.\s]*[\d\s\-]+)(.*)$/i) &&
		    WordCount($1) < 10 && 
		    WordCount($2) < 10 && 
		    WordCount($4) < 10 &&
		    WordCount($5) < 10
		    ) {
		# BOOK CHAPTER
		my $partauthors = $1 . '.';
		my $parttitle = $2;
		my $authors = $4;
		my $title = $5;
		my $publisher = $6;
		my $date = $7;
		my $pages = $8;
		my $rest = $9;

		my $num; 
		$num++ while $partauthors =~ /\S+/g;
		if($num > 15) {
		    # Lots of words in the author? Sounds like some random
		    # date in some text to me - make it a public note.
		    my $this_work = {
			'Type' => 'Note',
			'details' => {
			    'notetext' => $line,
			}
		    };
		    push @works, $this_work;
		    next;
		}
		$title =~ s/^[\s,\.\)]+//;
		$title =~ s/<i>//g;
		$title =~ s/<\/i>//g;
		$rest =~ s/^\s+//;
		my $this_work = {
		    'Type' => 'Book Chapter',
		    'details' => {
			'authors' => $authors,
			'title' => $title,
			'part_authors' => $partauthors,
			'part_title' => $parttitle,
			'pubdate' => $date,
			'publisher' => $publisher,
			'other_info' => "$rest (Original citation: $line)",
		    }
		};
		push @works, $this_work;
	    } elsif(
		    $line =~ /^(.+),*\s+\(*(\d\d\d\d)\)*(\s+\(ed\.*\))*,*\s+(.+),\s+(.*[a-z].+),\s+(.+)/ &&
		    WordCount($1) < 10 && 
		    WordCount($3) < 10 && 
		    WordCount($4) < 10
		    ) {
		# Book chapter in some weirdo Sociology format that they call
		# Harvard, but which isn't like anything else we've seen.
		my $partauthors = '';
		my $parttitle = '';
		my $authors = $1;
		my $date = $2;
		$authors .= $3;
		my $title = $4;
		my $publisher = $5;
		my $rest = $6;
		
		$title =~ s/^[\s,\.\)]+//;
		$title =~ s/<i>//g;
		$title =~ s/<\/i>//g;
		$rest =~ s/^\s+//;
		my $this_work = {
		    'Type' => 'Book Chapter',
		    'details' => {
			'authors' => $authors,
			'title' => $title,
			'part_authors' => $partauthors,
			'part_title' => $parttitle,
			'pubdate' => $date,
			'publisher' => $publisher,
			'other_info' => "$rest (Original citation: $line)",
		    }
		};
		push @works, $this_work;
	    } elsif(
		    $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\<i\>\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
		    $line =~ /^(.+)\s+\(*\(()\)\)*\.*\s*(.+)\s*\<i\>\s*(.+)\s*\<\/i\>[\s,.]*(\d+)\s*\((\d+)\)[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
		    $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s*(.+)\s*\<i\>\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
		    $line =~ /^(.+)\s+\(*()\)*\.*\s*(.+)\s*\<i\>\s*(.+)\s*\<\/i\>[\s,.]*\s*Vol\s*(\d+),*\s*Issue\s*(\d+)\s*[:,]*\s*(\d+[\-\x{2013}]\d+).*$/ ||
		    $line =~ /^(.+)\s+\(*(\d\d\d\d)\)*\.*\s+(.+)\s*[\.,]\s+(.+)[\.,]*\s(\d+)\s*\((\d+)\)[\:\.,]\s+(.+)/ ||
		    $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\s+(.+)\((.+)\):*\s+(.+)/ ||
		    $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,\s+(\d+)\:(\d+),*\s+(.+)/ ||
		    $line =~ /^(.+)\s+\((\d{4})\)\s+\'(.+)\'\s+(.+)\s+(\d+)\((\d+)\)\:\s+(.+)/ ||
		    $line =~ /^(.+)\s+\((\d{4})\)\s+[^[:ascii:]]+(.+)[^[:ascii:]]+\s+(.+)\,*\s+(\d+)\:(\d*),*\s+(.+)/
		    ) {
		# Journal article
		my $authors = $1;
		my $date = $2 || 'No date';
		my $title = $3;
		my $journal = $4;
		my $volume = $5;
		my $issue = $6;
		my $pages = $7;
		my $this_work = {
		    'Type' => 'Journal Article',
		    'details' => {
			'authors' => $authors,
			'title' => $title,
			'pubdate' => $date,
			'journal' => $journal,
			'volume' => $volume,
			'issue' => $issue,
			'pages' => $pages,
		    }
		};
		push @works, $this_work;
	    } elsif(
		    ($line =~ /^(.*)\s*(http:\/\/.+)(\".*)$/) ||
		    ($line =~ /^(.*)\s*(https:\/\/.+)(\".*)$/) ||
		    ($line =~ /^(.*)\s*(ftp:\/\/.+)(\".*)$/) ||
		    ($line =~ /^(.*)\s*(http:\/\/.+)\s*(.*)$/) ||
		    ($line =~ /^(.*)\s*(https:\/\/.+)\s*(.*)$/) ||
		    ($line =~ /^(.*)\s*(ftp:\/\/.+)\s*(.*)$/) ||
		    ($line =~ /^(.*)\s+(www.[^\s]+)(.*)$/)
		    ) {
		# An electronic resource
		my $prelude = $1;
		my $url = $2;
		my $rest = $3;
		my $this_work = {
		    'Type' => 'Electronic Resource',
		    'details' => {
			'prelude' => $prelude,
			'url' => $url,
			'rest' => $rest,
		    }
		};
		push @works, $this_work;
	    } elsif($line !~ /^\s*Note/i &&
		    (scalar(@_ = split(/\s+/,$line)) < 7)) {
		my $this_work = {
		    'Type' => 'Sub-Heading',
		    'details' => {
			'subheading' => "$line",
		    }
		};
		push @works, $this_work;
	    } else {
		if($debug) {
		    print DMP "Unmatched LINE: $line\n";
		    warn "Unmatched LINE: $line\n";
		}
		my $this_work = {
		    'Type' => 'Note',
		    'details' => {
			'notetext' => $line,
		    }
		};
		push @works, $this_work;
	    }
	}
    }
    close DMP;
    
    $return_code = ERR_SUCCESS;
    @{$results->{'works'}} = @works;
    return($return_code,$results);
}

sub Z3950Guess {
    my $params;
    %$params = @_;

    my $date = $params->{'date'} || return 0;
    my $poss_authors = $params->{'poss_authors'} || '';
    my $poss_title = $params->{'poss_title'} || '';
    my $line = $params->{'line'} || '';
    my $depth = $params->{'depth'} || 0;

    my $guessed = 0;
    my $conn; # Z39.50 connection
    my $zconnlife = $Z3950ConnectionLifespan;
    my $conncount = -1;

    my $sutype = 'Book';
    my $quotes = 0;
    my $seenstop = 0;
    my $seencomma = 0;
    my $processed_title = '';
    my $parttitle = '';
    my $partauthors = '';
    my $original_date = '';
    my $original_publisher = '';

    # Some things have weird, broken dates with things in brackets. Lets
    # remove them from the title.
    if($poss_title =~ /^\s*(\d{4})\]\s+(.+)/) {
	$original_date = $1;
	$poss_title = $2;
    }

    # Try to cut out publisher from the title
    if($poss_title =~ /(.+?)\.{0,1}\s+([\w\s]+\:\s+[\w\s]+)\.{0,1}\s*$/ ||
       $poss_title =~ /^(.+?)\s+([A-Za-z\-]+ [A-Z][A-Z])\:/ ||
       $poss_title =~ /^(.+?)\s+([A-Za-z\-]+, [A-Z][A-Z])\:/ ||
       $poss_title =~ /^(.+?)\s+([A-Za-z\-])+\:/ ||
       $poss_title =~ /^(.+?)\s(University of .+ Press)/ ||
       $poss_title =~ /^(.+?)\s([A-Za-z\-]+ University Press)/
       ) {
	$poss_title = $1;
	$original_publisher = $2;
	if($original_publisher =~ /^York/ &&
	   $poss_title =~ /New$/) {
	    $original_publisher = 'New ' . $original_publisher;
	    $poss_title =~ s/New$//;
	}
	if($debug) {
	    warn "Hopefully lopped publisher from title: $poss_title\n";
	}
    }
    $poss_title =~ s/^\s*[\,\.]\s+//;
    $poss_title =~ s/^\s+//;

    # Try to remove DDC class marks from "titles" for completely bogus 
    # referencing styles
    my $poss_classmark = '';
    if($poss_title =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) {
	$poss_classmark = $1;
	if($debug) {
	    warn "Z39.50 Class mark found: $poss_classmark\n";
	}
	$poss_title =~ s/(\d{3}\.\d+\/?[A-Z]{0,3}).*//g;
    } elsif($line =~ /(\d{3}\.\d+\/?[A-Z]{0,3}).*/) {
	$poss_classmark = $1;
	if($debug) {
	    warn "Z39.50 Class mark found: $poss_classmark\n";
	}
    }

    # Try to spot titles that are in weird Unicode quote marks.
    if($poss_title =~ /\xE2\x80\x98/) {
	if($debug) {
	    warn "Has crappy Unicode in title.\n";
	}
    }

    $poss_authors =~ s/\s&\s/ and /g;

    # Try to spot book chapters that have sneaked in as titles
    if($poss_title =~ /^(.+) in (.+) \(eds\)\s+(.+)/) {
	$partauthors = $poss_authors;
	$parttitle = $1;
	$poss_authors = $2;
	$poss_title = $3;
	if($debug) {	
	    warn "Looks like this is a book chapter from '$poss_title' edited by $poss_authors\n";
	}
	$sutype = "Book Chapter";
    }

    if($debug) {
	warn "Z39.50 Guessing with $poss_authors * $date * $poss_title\n";
    }

    my @titwords = split(/ /, $poss_title);
    while(my $titword = shift @titwords) {
	if($debug) {
	    warn "titword = $titword\n" if($poss_authors =~ /Webster/);
	}
	if($titword =~ /^[\'\"]/) {
	    $quotes++;
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if($titword =~ /[\'\"]$/) {
	    $quotes--;
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if($titword =~ /[\'\"]\.$/) {
	    $quotes--;
	    $seenstop = 1;
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if(!$quotes && $titword =~ /\.$/) {
	    $seenstop = 1;
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if($titword =~ /[\'\"]\,$/) {
	    $quotes--;
	    $seencomma = 1;
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if(!$quotes && $titword =~ /\,$/) {
	    $seencomma = 1;
	    if($debug) {
		warn "We've seen a comma!\n";
	    }
	    $processed_title .= ' ' . $titword;
	    next;
	}
	if($seenstop && lc($titword) eq 'in') {
	    $partauthors = $poss_authors;
	    $parttitle = $processed_title;
	    $sutype = 'Book Chapter';
	    my $rest = join(' ', @titwords);
	    $rest =~ /(.+),\s+(.+)/;
	    $poss_authors = $1;
	    $poss_title = $2;
	    $poss_title =~ s/[^\w\s]//g;
	    last;
	}
	if($seenstop) {
	    $poss_title = $processed_title;
	    $poss_title =~ s/[^\w\s]//g;
	    last;
	}
	if($seencomma && $titword =~ /\:/) {
	    $poss_title = $processed_title;
	    if($debug) {
		warn "Seen a colon - setting poss_title to $poss_title\n";
	    }
            last;
        }
	
	$processed_title .= ' ' . $titword;
    }
    $poss_title =~ s/-/ /g;
    $poss_title =~ s/\N{U+2018}/\'/g;
    $poss_title =~ s/\N{U+2019}/\'/g;
    $poss_title =~ s/\N{U+201c}/\"/g;
    $poss_title =~ s/\N{U+201d}/\"/g;
    $poss_title =~ s/[^\w\s\']//g;
    my $longestword = "";
    my $nextlongestword = "";
    foreach my $titword (split(/[\s,.;:\(\)]/, $poss_title)) {
	next if(lc($titword) eq 'or');
	next if(lc($titword) eq 'and');
	next if(lc($titword) eq 'not');
        next if($titword =~ /\'/);
        next if($titword =~ /^\d+$/);
	if(length($titword) >= length($longestword)) {
	    $nextlongestword = $longestword;
	    $longestword = lc($titword);
	} elsif(length($titword) > length($nextlongestword)) {
	    $nextlongestword = lc($titword);
	}
    }
    $nextlongestword = $longestword if($nextlongestword eq ""); 
    if($conncount > $zconnlife || $conncount == -1 || !defined($conn)) {
	$conn = 
	    new ZOOM::Connection($Z3950Hostname,
				 $Z3950Port,
				 databaseName => $Z3950DBName,
				 user => $Z3950Username,
				 password => $Z3950Password,
				 );
	$conn->option(preferredRecordSyntax => 'usmarc');
	$conncount = 0;
    }
    
    my $termcount = 0;
    my $this_zsearch = '';
    my $zsearch_count = 0;
    my @zsearch;
    my $zconj = '';
    if($poss_authors) {
	my $longestauthor = '';
	foreach my $authword (split(/[\s,.;:\(\)]/, $poss_authors)) {
	    next if($authword =~ /^and$/i);
	    next if($authword =~ /^\&$/i);
	    if(length($authword) >= length($longestauthor)) {
		$longestauthor = lc($authword);
	    }
	}
	
	$this_zsearch .= '@attr 1=1003 '.$longestauthor. ' ';
	$termcount++;
    }
    if($longestword) {
	if($longestword =~ /s$/) {
	    $this_zsearch .= "\@or \@attr 1=4 $longestword ";
	    $longestword =~ s/s$//;
	    $this_zsearch .= "\@attr 1=4 $longestword ";
	} else {
	    $this_zsearch .= "\@attr 1=4 $longestword ";
	}
	$termcount++;
	$zconj .= '@and ';
    }
    if($nextlongestword) {
	if($nextlongestword =~ /s$/) {
	    $this_zsearch .= " \@or \@attr 1=4 $nextlongestword ";
	    $nextlongestword =~ s/s$//;
	    $this_zsearch .= "\@attr 1=4 $nextlongestword ";
	} else {
	    $this_zsearch .= "\@attr 1=4 $nextlongestword ";
	}
	$termcount++;
	$zconj .= '@and ';
    }
    if($this_zsearch ne '') {
	my $no_date_zsearch = $zconj . $this_zsearch;
	$this_zsearch = '@and ' . $zconj . '@attr 1=31 ' . 
	    $date . ' ' . $this_zsearch;
	$zsearch[$zsearch_count++] = 
	    {
		'type' => 'primary',
		'search' => $this_zsearch,
	    };
	if($debug) {
	    warn "Zsearch = $this_zsearch\n";
	}

	# One year on
	$zsearch[$zsearch_count]->{'type'} = 'yearplus1';
	$zsearch[$zsearch_count]->{'search'} = $this_zsearch;
	my $date1 = $date + 1;
	$zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date1/g;

	# One year before
	$zsearch[$zsearch_count]->{'type'} = 'yearminus1';
	$zsearch[$zsearch_count]->{'search'} = $this_zsearch;
	my $date2 = $date - 1;
	$zsearch[$zsearch_count++]->{'search'} =~ s/$date/$date2/g;

	# Ignore years completely
	$zsearch[$zsearch_count++] = {
	    'type' => 'nopubdate',
	    'search' => $no_date_zsearch,
	};
    }

    if($poss_classmark) {
	$zsearch[$zsearch_count++] = {
	    'type' => 'classmark',
	    'search' => "\@attr 1=1016 $poss_classmark",
	};
    }

    return(0, {}) if($zsearch_count == 0);
    
    my $foundauthors = "";
    my $foundtitle = "";
    my $foundimprint = "";
    my $foundcallno =  "";
    my $foundedition = "";
    my $foundpubdate = "";
    my $foundisbn = "";
    my $authorjoin = "";
    
    my $havefound = 0;
    foreach my $this (@zsearch) {
	my $rs;
	my $thissearch = $this->{'search'};
	next if($thissearch eq '');
	if($debug) {
	    warn "Doing thissearch $thissearch\n";
	}
	eval($rs = $conn->search_pqf($thissearch));
	while(!defined($rs)) {
	    eval($rs = $conn->search_pqf($thissearch));
	}
	$rs->option(elementSetName => "");
	if((defined($rs)) && $rs && (my $n = $rs->size() > 0)) {
	    my $searchsize = $rs->size();
	    if($searchsize > $MaxZ3950Matches) {
		my $maxsize = $MaxZ3950Matches;
		$searchsize = $maxsize;
	    }
	    foreach my $i (1 .. $searchsize) {
		my $rec = $rs->record($i-1);
		my $data = $rec->render();
		foreach my $line (split(/\n/,$data)) {
		    if($debug) {
			warn "LINE: $line\n";
		    }
		    if(
		       ($line =~ /^020\s*\$a\s*([0-9X\-]+)/i) ||
		       ($line =~ /^022\s*\$a\s*([0-9X\-]+)/i) ||
		       ($line =~ /^001\s*([a-z0-9X\-]+)/i) ||
		       ($line =~ /^001\s*(-[a-z0-9X\-]+-*)/i)
		       
		       ) {
			$foundisbn = $1;
			if($foundisbn =~ /^-/) {
			    $foundisbn =~ s/^-//;
			    $foundisbn =~ s/-+$/--/;
			}
		    }
		    if($line =~ /^100.*\$a([^\$]+)/) {
			my $thisauth = $1;
			$thisauth =~ s/,\s+$//;
			$foundauthors .= $authorjoin.$thisauth;
			$authorjoin= ", ";
		    }
		    if($line =~ /^110.*\$a([^\$]+)/) {
			$foundauthors .= $authorjoin.$1;
			$authorjoin= ", ";
			if($line =~ /\$b([^\$]+)/) {
			    $foundauthors .= ": ".$1;
			}
		    }
		    if($line =~ /^111.*\$a([^\$]+)/) {
			$foundauthors .= $authorjoin.$1;
			$authorjoin= ", ";
		    }
		    if($line =~ /^245.*\$a([^\$]+)/) {
			$foundtitle = $1;
			$foundtitle =~ s/\/\s*$//;
			if($line =~ /\$b([^\$]+)/) {
			    $foundtitle .= " ".$1;
			}
		    }
		    if($line =~ /^250\s*\$a([^\$]+)/) {
			$foundedition = $1;
		    }
		    if($line =~ /^260.*\$c\s*c*(\d\d\d\d)/ ||
		       $line =~ /^264.*\$c\s*c*(\d\d\d\d)/
		       ) {
			$foundpubdate = $1;
			if($line =~ /\$b([^\$]+)/) {
			    $foundimprint = $1;
			}
		    }
		    if($line =~ /^700.*\$a([^\$]+)/) {
			my $thisauth = $1;
			$thisauth =~ s/,\s+$//;
			$foundauthors .= $authorjoin.$thisauth;
			$authorjoin= ", ";
		    }
		}
		if($debug) {
		    warn "Comparing $foundtitle with $longestword and $nextlongestword\n";
		    warn "And date $foundpubdate against $date\n";
		}
		if($this->{'type'} eq 'primary' &&
		   $foundtitle =~ /$longestword/i &&
		   (!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
		   (!$foundpubdate || $date == $foundpubdate)) {
		    $havefound = 1;
		    last;
		} elsif($this->{'type'} eq 'yearminus1' &&
			$foundtitle =~ /$longestword/i &&
			(!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
			(!$foundpubdate || ($date - 1) == $foundpubdate)) {
		    $havefound = 1;
		    last;
		} elsif($this->{'type'} eq 'yearplus1' &&
			$foundtitle =~ /$longestword/i &&
			(!$nextlongestword || $foundtitle =~ /$nextlongestword/i) &&
			(!$foundpubdate || ($date + 1) == $foundpubdate)) {
		    $havefound = 1;
		    last;
		} elsif($this->{'type'} eq 'nopubdate' &&
			$foundtitle =~ /$longestword/i &&
			(!$nextlongestword || $foundtitle =~ /$nextlongestword/i)) {
		    $havefound = 1;
		    last;
		} elsif($this->{'type'} eq 'classmark') {
		    if($debug) {
			warn "Classmark match from $searchsize results\n";
		    }
#		    $havefound = 1;
#		    last;
		} else {
		    $foundauthors = "";
		    $foundtitle = "";
		    $foundimprint = "";
		    $foundcallno = "";
		    $foundedition = "";
		    $foundpubdate = "";
		    $foundisbn = "";
		    $authorjoin = "";
		}
	    }
	} else {
	    if($debug) {
		warn "No matches for $thissearch\n";
	    }
	}
	last if($havefound);
    }
    $foundauthors =~ s/^\s+//;
    $foundauthors =~ s/\s+$//;
    $foundtitle =~ s/^\s+//;
    $foundtitle =~ s/\s+$//;
    $foundtitle =~ s/\/$//;
    $conncount++;
    
    if($debug) {
	warn "Foundauthors: $foundauthors\n";
	warn "Foundtitle:   $foundtitle\n";
	warn "Foundpubdate: $foundpubdate\n";
    }
    
    my $this_work;
    if(($foundauthors || $foundtitle)) {
	$this_work = {
	    'Type' => $sutype,
	    'details' => {
		'authors' => $foundauthors,
		'title' => $foundtitle,
		'pubdate' => $date,
		'publisher' => $foundimprint,
		'isbn' => $foundisbn,
		'other_info' => "Original citation: $line",
	    }
	};
	if($foundedition) {
	    $this_work->{'detail'}->{'edition'} = $foundedition;
	}
	if($sutype eq 'Book Chapter') {
	    if($partauthors) {
		$this_work->{'detail'}->{'part_authors'} = 
		    $partauthors;
		if($debug) {
		    warn "Setting part authors to " . 
			$this_work->{'detail'}->{'part_authors'} . "\n";
		}
	    }
	    if($parttitle) {
		$this_work->{'detail'}->{'part_title'} = $parttitle;
		if($debug) {
		    warn "Setting part title to " . 
			$this_work->{'detail'}->{'part_title'} . "\n";
		}
	    }
	}
	$guessed = 1;
	if($debug) {
	    warn Dumper($this_work) . "\n";
	}
    } else {
	if($depth == 0 && $poss_title =~ /ize/) {
	    $poss_title =~ s/ize/ise/g;
	    ($guessed, $this_work) = Z3950Guess(poss_title => $poss_title,
						poss_authors => $poss_authors,
						date => $date,
						line => $line,
						depth => $depth+1,
                                                );
	}
	if(!$guessed && $depth == 0 && $poss_title =~ /ise/) {
	    $poss_title=~ s/ise/ize/g;
            ($guessed, $this_work) = Z3950Guess(poss_title => $poss_title,
						poss_authors => $poss_authors,
                                                date => $date,
                                                line => $line,
						depth => $depth+1,
                                                );
	}
    }
    return($guessed, $this_work);
}

sub WordCount {
    my $text = shift;
    if($debug) {
	warn "Counting words in $text\n";
    }
    my @text_words = split(/\s+/, $text);
    my $count = scalar(@text_words);
    if($debug) {
	warn "Found $count words\n";
    }
    return $count;
}

sub OutputRIS {
    my $params;
    %$params = @_;

    my $returns = $params->{'returns'} || {};
    my $code = $params->{'code'} || 0;
    

    print "Content-type: “application/x-research-info-systems;charset=utf-8\n\n";
    print "Provider: ImportCitations\r\n";
    print "Content: RIS\r\n";
    if(ref $returns->{'works'} eq 'ARRAY') {
	foreach my $work (@{$returns->{'works'}}) {
	    if($work->{'Type'} eq 'Book') {
		print "TY  - BOOK\r\n";
	    } elsif($work->{'Type'} eq 'Book Chapter') {
		print "TY  - CHAP\r\n";
	    } elsif($work->{'Type'} eq 'Journal') {
		print "TY  - JFULL\r\n";
	    } elsif($work->{'Type'} eq 'Journal Article') {
		print "TY  - JOUR\r\n";
	    } elsif($work->{'Type'} eq 'Electronic Resource') {
		print "TY  - ELEC\r\n";
	    }

	    my $details = $work->{'details'};
	    if($details->{'authors'}) {
		print "AU  - " . $details->{'authors'} . "\r\n";
	    }
	    if($details->{'title'}) {
		print "TI  - ". $details->{'title'} . "\r\n";
	    }	
	    if($details->{'journal'}) {
		print "T2  - ". $details->{'journal'} . "\r\n";
	    }
	    if($details->{'issue'}) {
		print "IS  - ". $details->{'issue'} . "\r\n";
	    }
	    if($details->{'volume'}) {
		print "VL  - ". $details->{'volume'} . "\r\n";
	    }
	    if($details->{'edition'}) {
		print "ET  - ". $details->{'edition'} . "\r\n";
	    }
	    if($details->{'publisher'}) {
		print "PB  - ". $details->{'publisher'} . "\r\n";
	    }
	    if($details->{'pubwhere'}) {
		print "CY  - ". $details->{'pubwhere'} . "\r\n";
	    }
	    if($details->{'isbn'}) {
		print "SN  - ". $details->{'isbn'} . "\n";
	    }
	    if($details->{'issn'}) {
		print "SN  - ". $details->{'issn'} . "\n";
	    }
	    if($details->{'other_info'}) {
		print "OP  - ". $details->{'other_info'} . "\n";
	    }
	    if($details->{'pages'}) {
		# Have to hack pages into start/end pages.
		if($details->{'pages'} =~ /(\d+)\-(\d+)/) {
		    print "SP  - $1\r\nEP  - $2\r\n";
		} elsif($details->{'pages'} =~ /(\d+)/) {
		    print "SP  - $1\r\n";
		} else {
		    print "SP  - " . $details->{'pages'} . "\r\n";
		}
	    }
	    if($details->{'pubdate'}) {
		if($details->{'pubdate'} =~ /(\d{4})/) {
		    my $pubyear = $1;
		    print "PY  - ".sprintf("%04d\n", $pubyear). "\n";
		}
	    }
	    if($details->{'url'}) {
		print "UR  - ". $details->{'url'} . "\n";
	    }
	}
    }
    
}

sub OutputJSON {
    my $params;
    %$params = @_;

    my $returns = $params->{'returns'} || {};
    my $code = $params->{'code'} || 0;
    my $callback = $params->{'callback'} || '';

    my $json = JSON->new->allow_nonref;
    if($code == ERR_SUCCESS) {
	if($callback) {
	    print "Content-type: application/javascript;charset=utf-8\n\n";
	    my ($f) = $callback =~ m/([A-Za-z_][0-9A-Za-z_]*)/;
	    print "$f(".$json->encode($returns).");";
	} else {
	    print "Content-type: application/json\n\n";

	    print $json->pretty->encode($returns);
	}
    }
}

sub SearchForm {
    print $q->header();
    print $q->start_html();
    print $q->h1('Citation Importer');
    print $q->p('Please supply a file containing citations');
    print $q->start_form();
    print $q->filefield(
	-name => 'file',
	-size=>50,
	-maxlength=>80,
	);
    print $q->p('Output format: '.
		$q->popup_menu(-name => 'format',
			       -values => ['RIS', 'JSON'],
			       -default => 'RIS',
		)
	);
    print $q->p('If JSON output, specify a call back for JSONP (if desired):',
		$q->textfield(-name => 'callback'));
    print $q->submit();
    print $q->end_form();
    print $q->end_html();
}