#!/usr/bin/python
# preprocess DUC '03 data

import glob
import os
import shutil
import sys

if not os.path.isfile("duc2003.breakSent.tar.gz"):
	print "Missing data file: 'duc2003.breakSent.tar.gz'!"
	print "Please obtain it from the DUC website http://duc.nist.gov/duc2004/software/"
	print "and place into the same directory as this script."
	sys.exit()

if not os.path.isfile("duc03.results.data.tar.gz"):
	print "Missing data file: 'duc03.results.data.tar.gz'!"
	print "Please obtain DUC '03 dataset from the DUC website http://duc.nist.gov/"
	print "and place into the same directory as this script."
	sys.exit()

print "Processing DUC '03 dataset..."

# extract DUC data
os.system("tar xzf duc2003.breakSent.tar.gz")
os.system("tar xzf duc03.results.data.tar.gz")
os.chdir("results")
os.system("tar xzf detagged.duc2003.abstracts.tar.gz")
os.chdir("../testdata/task2")
os.system("tar xzf task2.docs.tar.gz")
os.chdir("../..")

# process manual summaries
# Copy the extracted "peer7.2" abstracts into a fresh "abs" directory
# (copytree requires that "abs" does not exist yet), then run the conversion
# script. execfile() runs the script inside this script's own namespace and
# current working directory.
# NOTE(review): convabs3.py is not visible here; presumably it rewrites the
# files under "abs" -- confirm against the script.
shutil.copytree("results/detagged.duc2003.abstracts/peer7.2", "abs")
execfile("../scripts/convabs3.py")

# process documents
# Same pattern for the source documents: copy them into a fresh "docs"
# directory, then run the two conversion scripts in this order.
# NOTE(review): what convdocs4.py / convhdr4.py do is not visible from
# here -- confirm before reordering.
shutil.copytree("testdata/task2/docs", "docs")
execfile("../scripts/convdocs4.py")
execfile("../scripts/convhdr4.py")

# convert into final format
for i in glob.glob("abs/sum_*"):
	shutil.copy(i, "docs")
os.chdir("docs")
execfile("../../scripts/mksvmds4.py")
os.chdir("..")
os.rename("docs", "data")
execfile("../scripts/wmapunpik.py")
execfile("../scripts/cntcfs1incl.py")
shutil.copy("../scripts/stops", "data")
os.chdir("..")

print "Done."


