#!/usr/local/bin/perl5
# /**********************************************************************
#  *                              STARTS                                *
#  *      Stanford Protocol for Internet Retrieval and Search           *
#  *                                                                    *
#  *                      Reference Implementation                      *
#  *                                                                    *
#  *                           Carl Lagoze                              *
#  *                      lagoze@cs.cornell.edu                         *
#  *                   Department of Computer Science                   *
#  *                        Cornell University                          *
#  *********************************************************************/

# This perl script serves as a CGI link between a HTTP server and the 
# StartsServer application.  The set of URL's that it is built to handle
# are as follows:
# /STARTS/FORMINPUT - a POST request from the StartsInput.htm input form. The
# contents of the POST is the form data.
# /STARTS/QUERY/<source> - a POST request for a query to the specified
# source.  The contents of the POST is the SOIF containing the STARTS
# query.
# /STARTS/CONTENTSUMMARY/<source> - a request for a content summary of
# the specified source.
# /STARTS/SOURCEMETA/<source> - a request for the source metadata of the 
# specified source.
# /STARTS/RESOURCEMETA - a request for the resource metadata of this server.
# /STARTS/SAMPLE/<source> - a request for the sample database results from
# the specified source (unimplemented at this time).
#
# The configuration file for the HTTP server should be configured to pass
# these STARTS request to this CGI script.

$maintainer = "lagoze\@cs.cornell.edu";

# Process the CGI request
sub process_request {
    $request_method = $ENV{'REQUEST_METHOD'};

    # all requests are POST
    if ($request_method eq "POST") {
	if (!defined($ENV{'CONTENT_LENGTH'})) {
	    print "HTTP/1.0 400 Bad request\n";
	    print "Allow: GET, HEAD, POST\n";
	    print "Content-type: text/html\n\n";
	    print "<HTML><HEAD>\n";
	    print "<TITLE>400 Bad request</TITLE>\n";
	    print "</HEAD><BODY>\n";
	    print "<H1>POST request did not include a CONTENT_LENGTH</H1>\n";
	    print "</BODY><HTML>\n";
	    exit;
	}
        read(STDIN, $query_string, $ENV{'CONTENT_LENGTH'});
    }
    elsif ($request_method eq "GET") {
    }
    else {
        print "HTTP/1.0 405 Method Not Allowed\n";
        print "Allow: GET, POST\n";
        print "Content-type: text/html\n\n";
        print "<HTML><HEAD>\n";
        print "<TITLE>405 Method Not Allowed</TITLE>\n";
        print "</HEAD><BODY>\n";
        print "<H1>405 Method Not Allowed</H1>\n";
        print "Only POST, GET, and HEAD are supported\n";
        print "</BODY><HTML>\n";
	exit;
    }

    $script_name = $ENV{'SCRIPT_NAME'};

    # STARTS URL's look like /STARTS/<op>.  For all but RESOURCEMETA
    # and FORMINPUT it looks like /STARTS/<op>/<sourceid>
    $script_name = substr($script_name, 1);
    ($header, $operation, $source) = split('/', $script_name);

    if ($operation eq "FORMINPUT") {
	&form_input($query_string);
    }
    elsif ($operation eq "QUERY") {
	&send_query($source, $query_string);
    }
    elsif ($operation eq "CONTENTSUMMARY") {
	&send_source_content_summary($source);
    }
    elsif ($operation eq "SOURCEMETA") {
	&send_source_meta($source);
    }
    elsif ($operation eq "RESOURCEMETA") {
	&send_resource_metadata();
    }
    elsif ($operation eq "SAMPLE") {
	&send_source_sample($source);
    }
    else {
	print "HTTP/1.0 400 Bad request\n";
	print "Unknown STARTS operation\n";
	print "Content-type: text/html\n\n";
	print "<HTML><HEAD>\n";
	print "<TITLE>400 Bad request</TITLE>\n";
	print "</HEAD><BODY>\n";
	print "<H1>Unknown operation: $operation</H1>\n";
	print "</BODY><HTML>\n";
	exit;
    }

}

# Extract the FORMINPUT data.
sub form_input {
    local($query_string) = @_;

    @in = split(/&/,$query_string);
    
    foreach $i (0 .. $#in) {
	# Convert plus's to spaces
	$in[$i] =~ s/\+/ /g;
	
	# Split into key and value.  
	($key, $val) = split(/=/,$in[$i],2); # splits on the first =.
	
	# Convert %XX from hex numbers to alphanumeric
	$key =~ s/%(..)/pack("c",hex($1))/ge;
	$val =~ s/%(..)/pack("c",hex($1))/ge;
	
	# Associate key and value
	$in{$key} .= "\0" if (defined($in{$key})); # \0 is the multiple separator
	$in{$key} .= $val;
    }

    if ($in{"Service"} eq "Query") {
	&send_form_query(*in);
    }
    elsif ($in{"Service"} eq "Source Metadata") {
	&send_source_metadata($in{"Source"});
    }
    elsif ($in{"Service"} eq "Source Content Summary") {
	&send_source_content_summary($in{"Source"});
    }
    elsif ($in{"Service"} eq "Resource Metadata") {
	&send_resource_metadata();
    }
}

# Turn the form data into a QUERY soif.
sub send_form_query {
    local(*in) = @_;
    $querySoif = '@SQuery{Version{10}:	STARTS 1.0 ';

    foreach $tag (
		  "FilterExpression", 
		  "RankingExpression", 
		  "DropStopWords",
		  "DefaultAttributeSet",
		  "DefaultLanguage",
		  "AnswerFields",
		  "SortByFields",
		  "MinDocumentScore",
		  "MaxNumberDocuments"
		  ) 
        {

	    if (defined($in{$tag}) && length($in{$tag}) > 0) {
		if ($tag eq "DropStopWords") {
		    $in{$tag} = substr($in{$tag}, 0, 1);
		}
		$querySoif .= 
		    $tag. '{' . length($in{$tag}) . '}' . ":	" . $in{$tag};
	    }
	}
    if ($in{"AllSources"} eq "yes") {
	if ($in{"Source"} eq "cstr") {
	    $otherSource = "linux";
	}
	else {
	    $otherSource = "cstr";
	}
	$querySoif .= 
	    "Sources" . '{' . length($otherSource) . '}' . ":	" . $otherSource;
    }
    $querySoif .= '}';

     open(TMPH, ">/tmp/foo");
    print TMPH $querySoif,"\n";
   &send_query($in{"Source"}, $querySoif);
}
	
# send a QUERY service request
sub send_query{
    local($source, $querySoif) = @_;
    print S "QUERY\n";
    print S $source,"\n";
    print S $querySoif,"\n";
}

# send a SOURCEMETA service request.
sub send_source_metadata {
    local($source) = @_;
    print S "SOURCEMETA\n";
    print S $source,"\n";
}

# send a SOURCECONTENT service request.
sub send_source_content_summary {
    local($source) = @_;
    print S "SOURCECONTENT\n";
    print S $source,"\n";
}

# send a RESOURCEMETA service request
sub send_resource_metadata {
    print S "RESOURCEMETA\n";
}

# host is the host on which the StartsServer application is running.  port
# is the port on which StartsServer is listenning at that host.
$host = "cs-tr.cs.cornell.edu";
$port = 6789;

# Following two are OS specific often
$AF_INET = 2;
$SOCK_STREAM = 2;

$sockaddr = "S n a4 x8";

# return the internet address of a server and port as a socket address
sub Get_Internet_Address {
    local ($server, $port) = @_;
    local ($name, $aliases, $type, $len, $destaddr) = gethostbyname ($server);
    if ($destaddr eq "") {
    	0;
    }
    else {
    	pack ($sockaddr, $AF_INET, $port, $destaddr);
    }
}


if (!($address = &Get_Internet_Address ($host, $port))) {
    print "HTTP/1.0 500 STARTS error\n";
    print "Content-type: text/plain\n\n";
    print "Can't connect to STARTS server at $host $port.\n";
    print "Contact $maintainer\n";
    exit;
}
else {
    local ($name, $aliases, $protocol) = getprotobyname("tcp");
    if (! socket (S, $AF_INET, $SOCK_STREAM, $protocol)) {
	print "HTTP/1.0 500 STARTS error\n";
	print "Content-type: text/plain\n\n";
	print "Can't connect to STARTS server at $host $port.\n";
	print "Contact $maintainer\n";
	exit;
    }
    else {

	local ($attempts) = 0;
	local ($connected) = 0;

	while ((! $connected) && ($attempts < 5)) {
	    if (connect (S, $address)) {
		$connected = 1;}
	    else {
		sleep 2;
		$attempts++;
	    }
	}

	if (! $connected) {
	    print "HTTP/1.0 503 STARTS Server Unavailable \n";
	    print "Content-type: text/html\n\n";
	    print "Could not connect to STARTS server.\n";
	    print "Please contact $maintainer.\n";
	    exit;
	}
	else {
	    select (S);
	    $| =1 ;
	    &process_request();
	    while(<S>) {print STDOUT $_;}	
	}
    }
}


1;
