#!/usr/bin/perl -w
#
# Author:      Norbert Klasen
# E-mail:      klasen@zdv.uni-tuebingen.de
# 
# $Id: rfc-parse2ldif.pl,v 1.4 2001/04/06 18:07:38 zrdkn01 Exp $
#
# Description: parses rfc-index.txt and generates ldif          
#              
# Based on:    rfc-parse.pl by RFC.net
# Licence:     GPL

use English;
use strict;
my $basedn="cn=IETF,cn=bibtex,dc=directory,dc=dfn,dc=de";
my $baseurl="ftp://ftp.isi.edu/in-notes";
my ($line, $list);

while (<STDIN>)
{
	last if m/^0001/;
}

$list .= $_;

while (<STDIN>)
{
	$list .= $_;
}

$list =~ s/^    //gm;				# nuke leading indentation
$list =~ s/\n\n/%%##FOO##%%/gs;		# turn \n\n into a place holder
$list =~ s/\n//gs;					# nuke lone \n's
$list =~ s/%%##FOO##%%/\n/gs;		# place holder -> \n

# we now have the rfc indexes on single lines in one var.
# next, split into an array.

my (@index);
@index = split /\n/, $list;

# holds real 'data'
my ($rfc, @things, $date, $year, $month, $day, $name, $texname, @authors, $writby, $months, $txt, $ps, $pdf, $also, $flags);
# pprocessing vairables;
my ($stuff, $end, $tmp, $x);

# used to match the other date format.
$months = "January|February|March|April|May|June|July|August|September|October|November|December";

foreach $line (@index)
{
	chomp $line;
	if ($line !~ /^\d\d\d\d/)
	{
		print $line;
	}
	($rfc, $stuff, $end) = ($line =~ /^(\d{4})(.+\.)([^.]+)$/);
	next if ( !defined($rfc) );

	#start ldif output
	print "dn: cn=RFC$rfc,$basedn\n";
	print "cn: RFC$rfc\n";
	print "objectclass: top\n";
	print "objectclass: bibtexEntry\n";
	print "objectclass: bibtexTechReport\n";
	print "objectclass: rfcStatusInformation\n";
	print "bibtexInstitution: IETF\n";
	print "bibtexNumber: $rfc\n";
	print "bibtexType: RFC\n";

	if ( $stuff =~ s/([a-zA-Z]{3})-(\d{1,2})-(\d{4})\.$// ) {
		$day = $2;
		$month = lc($1);
		$year = $3;
	} elsif ( $stuff =~ s/($months) (\d{4})\.$// ) {
		$month = lc(substr($1, 0, 3));
		$year = $2;
	} else {
		die "RFC$rfc: no date\n";
	}
	print "bibtexYear: $year\n";
	print "bibtexMonth: $month\n";


	# Title F.O.O. title. A. Author, A. Author.
	#
	# but first - some quirks
	
	if ($rfc eq "0016") {
		($name, $stuff) = (" M.I.T. ", "S. Crocker");
	} elsif ($rfc eq "0270") {
		($name, $stuff) = ("Correction to BBN Report No. 1822 (NIC NO 7958). ",
						   "A.M. McKenzie");
	} elsif ($rfc eq "1790") {
		($name, $stuff) = ("An Agreement between the Internet Society and Sun Microsystems, Inc. in the Matter of ONC RPC and XDR Protocols. ",
						   "V. Cerf");
	} elsif ($rfc eq "2339") {
		($name, $stuff) = ("An Agreement Between the Internet Society, the IETF, and Sun Microsystems, Inc. in the matter of NFS V.4 Protocols. ",
						   "The Internet Society, Sun Microsystems");
	} elsif ($rfc eq "2429") {
		($name, $stuff) = ("RTP Payload Format for the 1998 Version of ITU-T Rec. H.263 Video (H.263+). ",
						   "C. Bormann, L. Cline, G. Deisher, T. Gardos, C. Maciocco, D. Newell, J. Ott, G. Sullivan, S. Wenger, C. Zhu");
	} elsif ($rfc eq "2657") {
		($name, $stuff) = ("LDAPv2 Client vs. the Index Mesh. ","R. Hedberg");
	} else {
		($name) = ($stuff =~ /^(.*?([^. ]{2,}\. ){1})/);

		if (!defined($name) or !defined($stuff))
		{
			print "XXX $rfc XXX\n";
			print $line."\n";
		}

		if ($name eq $stuff)
		{
			($name) = ($stuff =~ /^(.*?([0-9]\. ){1})/);
		}

		if ($name eq $stuff)
		{
			# XXX this was used to generate a quirk table that the above regexp obsoletes.
			print ' elsif ($rfc eq "'.$rfc."\") {\n";
			print "\t\t".'($name, $stuff) = ("'.$stuff."\");\n";
			print "\t}"
		}
		my ($tmp) = $name;
		$tmp =~ s/(\W)/\\$1/g;
		$stuff =~ s/^$tmp//;

	}
	$name =~ s/^ //;
	$stuff =~ s/\. $//;

	@authors = undef;
	my (@newauthors, $author);

	if ($rfc eq "0907") {
		($name, $authors[0]) = ("Host Access Protocol specification. ",
						   		"Bolt, Beranek and Newman, Inc.");
	} elsif ($rfc eq "1950"){
		$name = "ZLIB Compressed Data Format Specification version 3.3. ";
		@authors = ("P. Deutsch", "J-L. Gailly");
	} elsif ($rfc eq "2188"){
		$name = "AT&T/Neda's Efficient Short Remote Operations (ESRO) Protocol Specification Version 1.2. ";
		@authors = ("M. Banan", "M. Taylor", "J. Cheng");
	} elsif ($rfc eq "2471"){
		$name = "IPv6 Testing Address Allocation. ";
		@authors = ("R. Hinden", "R. Fink", "J. Postel");
	} elsif ($rfc eq "1932") {
		$name = "IP over ATM: A Framework Document. ";
		@authors = ("R. Cole", "D. Shur", "C. Villamizar");
	} elsif ($stuff =~ /\,/) {
		@authors = split ',', $stuff;
		foreach $author (@authors)
		{
			if ($author =~ /\&/)
			{
				@newauthors = (@newauthors, split '&', $author);
			} else {
				@newauthors = (@newauthors, $author);
			}
		}
		@authors = @newauthors;
	} elsif ($stuff =~ /\&/) {
		@authors = split '&', $stuff;
	} else {
		$authors[0] = $stuff;
	}

	#trim whitespaces
	for ($x = 0 ; $x <= $#authors ; $x++)
	{
		$authors[$x] =~ s/^\s+//;
		$authors[$x] =~ s/\s+$//;		
	}


	#if (length($name) > 255) { die "RFC$rfc has a name longer than 255"; }

	$name =~ s/\. $//;
	#preserve case in tex
	($texname = $name) =~ s/(([a-zA-Z0-9&-]*[A-Z]){2}[a-zA-Z0-9&-]*)/{$1}/g;
	#ITU recommendations
	$texname =~ s/([A-Z](\.\d+)+(\(\d+\))?)/{$1}/g;
	print "bibtexTitle: $name\n";
	if ( $name ne $texname ) {
		print "bibtexTitle;lang-x-tex: $texname\n";
	}

	foreach $author ( @authors ) {
		print "bibtexAuthor: $author\n";
	}
	if ( @authors > 1 ) {
		print "bibtexAuthor;lang-x-tex: " . join(" and ", @authors) . "\n";
	}
	
	
	@things = ($end =~ /(\([^()]+\))/g);
	my ($attr, $status, $link);
	
	foreach $attr (@things)
	{
		if ($attr =~ /^\(Format: /)
		{
			my ($formats, @things, $format, $thing, $size);
			# XXX this should be changed to use another table (?) XXX
			#(Format: TXT=9107 bytes)
			#(Format: TXT=22254, PS=176169 bytes)
			#(Format: TXT=120959, PS=534729, PDF=255616 bytes)
			$attr =~ /^\(Format: (.*) bytes\)$/;
			$formats = $1;
			@things = split /, /,$formats;
			foreach $format (@things) {
				($thing, $size) = split /=/, $format;
				if ($thing eq "TXT") {
					print "labeledUri: $baseurl/rfc$rfc.txt ASCII\n";
				} elsif ($thing eq "PS") {
					print "labeledUri: $baseurl/rfc$rfc.ps PostScript\n";
				} elsif ($thing eq "PDF" ) {
					print "labeledUri: $baseurl/rfc$rfc.pdf PDF\n";
				} else {
					print STDERR "XXX RFC$rfc Unknown Format in $attr: $thing (= $size)\n";
				}
			}
		} elsif ($attr =~ /^\(Obsoleted by (.*?)\)$/) {
			foreach $link ( split /, /, $1 ) {
				print "rfcObsoletedBy: cn=".fix_rfc($link).",$basedn\n";
			}
		} elsif ($attr =~ /^\(Obsoletes (.*?)\)$/) {
			foreach $link ( split /, /, $1 ) {
#				$link =~ s/ //g;
				print "rfcObsoletes: cn=".fix_rfc($link).",$basedn\n";
			}
		} elsif ($attr =~ /^\(Updates (.*?)\)$/) {
#			$attr = $1;
#			if ($rfc eq '2231') { $attr = 'RFC2045, RFC2047, RFC2183'; }
			foreach $link ( split /, /, $1 ) {
				print "rfcUpdates: cn=".fix_rfc($link).",$basedn\n";
			}
		} elsif ($attr =~ /^\(Updated by (.*?)\)$/) {
			foreach $link ( split /, /, $1 ) {
				print "rfcUpdatedBy: cn=".fix_rfc($link).",$basedn\n";
			}
		} elsif ($attr =~ /^\(Status: ([A-Z ]+)\)$/) {
#			if (defined($status))
#			{
#				die "RFC$rfc more than one status: ".$attr." :: $status";
#			}
#			$attr =~ /^\(Status: ([A-Z ]+)\)$/;
#			$status = $1;
			print "rfcStatus: $1\n";
		} elsif ($attr =~ /^\(Also (.*?)\)$/) {
			foreach $link ( split /, /, $1 ) {
				print "seeAlso: cn=".fix_rfc($link).",$basedn\n";
			}
		} elsif ($attr =~ /^\(Not online\)$/) {
			#
		} else {
			print STDERR "XXX RFC$rfc Unknown Attribute: ".$attr."\n";
		}
	}
	
	##
	## now we have parsed everything - shove it into the db.
	##
	
	print "\n";
}


exit(0);


sub fix_rfc()
{
	my ($rfc) = @_;
	
	if ($rfc =~ /^\s*(RTR|RFC|STD|BCP|FYI|IEN|NIC)\s*(\d+)\s*$/)
	{
		$rfc = sprintf("%s%04d", $1, $2);
		return($rfc);
	} elsif ($rfc =~ /^\s*(\d+)\s*$/){
		$rfc = sprintf("RFC%04d", $1);
		return($rfc);
	} else {
		die "unable to parse: ".$rfc;
	}
}

sub fix_month()
{
	my ($month) = @_;
	return lc(substr($month,1,3));
}

sub fix_date()
{
	my ($date) = @_;
	my ($months, $mon, %months, %mon, $x, $y);

	$months = "January|February|March|April|May|June|July|August|September|October|November|December";
	$y = 1; foreach $x (split /\|/, $months) { $months{$x} = $y; $y++; }
	
	$mon = "Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec";
	$y = 1; foreach $x (split /\|/, $mon) { $mon{$x} = $y; $y++; }
	
	#May-01-1969
	#April-01-1969 <- watch out!
	#January 2000
	
	if ($date =~ /^($mon)-(\d{1,2})-(\d\d\d\d)$/)
	{
		$date = sprintf("%s-%02d-%02d", $3, $mon{$1}, $2);
	} elsif ($date =~ /^April-(\d{1,2})-(\d\d\d\d)$/) {
		$date = sprintf("%s-03-%02d", $2, $1);
	} elsif ($date =~ /^^($months) (\d\d\d\d)$/) {
		$date = sprintf("%s-01-%02d", $2, $months{$1});
	} else {
		die "XXX unknown date format: ".$date."\n";
	}
	
}

sub commify_series {
    (@_ == 0) ? ''                                      :
    (@_ == 1) ? $_[0]                                   :
    (@_ == 2) ? join(" and ", @_)                       :
                join(", ", @_[0 .. ($#_-1)], "and $_[-1]");
}

