#!/usr/bin/python
# preprocess DUC '04 data

import os
import sys
import shutil

if not os.path.isfile("duc2003.breakSent.tar.gz"):
	print "Missing data file: 'duc2003.breakSent.tar.gz'!"
	print "Please obtain it from the DUC website http://duc.nist.gov/duc2004/software/"
	print "and place into the same directory as this script."
	sys.exit()

if not os.path.isfile("duc04.results.data.tar.gz"):
	print "Missing data file: 'duc04.results.data.tar.gz'!"
	print "Please obtain DUC '04 dataset from the DUC website http://duc.nist.gov/"
	print "and place into the same directory as this script."
	sys.exit()

print "Processing DUC '04 dataset..."

# extract DUC data
os.system("tar xzf duc2003.breakSent.tar.gz")
os.system("tar xzf duc04.results.data.tar.gz")
shutil.move("past_duc/duc2004/results", "results")
shutil.move("past_duc/duc2004/testdata", "testdata")
os.chdir("results/ROUGE")
os.system("tar xzf duc2004.task2.ROUGE.models.tar.gz")
os.chdir("../../testdata/tasks1and2")
os.system("tar xzf duc2004.tasks1and2.docs.tar.gz")
os.chdir("../..")

# process manual summaries
# Copy the extracted ROUGE model directory "2" to ./abs, then run the
# conversion script. execfile() without explicit globals runs the script in
# this module's namespace and in the current working directory, so the
# "../scripts" path is relative to wherever the script is currently running.
# NOTE(review): convabs3.py presumably converts the files under abs/ in
# place -- confirm against that script.
shutil.copytree("results/ROUGE/eval/models/2", "abs")
execfile("../scripts/convabs3.py")

# process documents
# Copy the extracted task 1/2 documents to ./docs, then run the two document
# conversion scripts in this order.
# NOTE(review): convdocs4.py / convhdr4.py presumably operate on docs/ --
# confirm against those scripts.
shutil.copytree("testdata/tasks1and2/t1.2/docs", "docs")
execfile("../scripts/convdocs4.py")
execfile("../scripts/convhdr4.py")

# convert into final format
for i in glob.glob("abs/sum_*"):
	shutil.copy(i, "docs")
os.chdir("docs")
execfile("../../scripts/mksvmds4.py")
os.chdir("..")
os.rename("docs", "data")
execfile("../scripts/wmapunpik.py")
execfile("../scripts/cntcfs1incl.py")
shutil.copy("../scripts/stops", "data")
os.chdir("..")

print "Done."


