# a3_soln.py
# Lillian Lee (LJL2) with contributions by Victoria Litvinova and Austen Joa
# Apr 16 2018
"""Solutions to A3, including alternate solutions.

Some of the solutions show you how you could use map or filter, although
those were disallowed for certain parts of the assignment."""

from tkinter import filedialog  # Get visual request for file selection
import urllib.request  # Get vocabulary from a webpage
import string  # Get some useful string built-in values
import os
import sys

from sources import econ_terms

ECON_VOCAB = econ_terms.ECON_VOCAB

# Default location of the locally-cached Economist A-to-Z data, used by
# get_econ_vocab() when no filename is given.
# NOTE(review): the original chunk referenced ECON_DATA_FNAME without ever
# defining it (a NameError waiting to happen); this default path is an
# assumption -- confirm against the actual data file's location.
ECON_DATA_FNAME = os.path.join('sources', 'econ_data.txt')


# STUDENTS: this function has been completed for you.
def get_content_lines(fname=None):
    """Given filename fname, return a list of normalized lines in the file,
    except that lines that are 'commented out', in the sense of the first
    non-whitespace character being a '#', are not included.

    EXCEPTION: if no argument is given, the "fname=None" in the function
    header sets fname to None (the actual value, NOT the string "None"),
    which, in the code below, causes the filename to be retrieved from the
    user via a visual file-open dialog.

    The normalization is almost exactly as described in Section 13.3
    "Word histogram" in the text, and in particular the function
    process_line: hyphens are replaced with spaces (so "highly-flammable
    programs" would become three words); punctuation at the beginnings and
    ends of words is removed (so that "Really?", "Really!" and "Really" are
    all treated as the same word), and all words are lower-cased (so that
    "CS1110" and "cs1110" are treated as the same word.)
    Leading or trailing whitespace is removed, and all line-internal
    whitespace is replaced by a single space.
    However, a "\\n" is added to the end of every line.

    Precondition: fname is the name of a plain-text file, OR it is not given
    by the caller (in which case Python will set parameter fname to None).
    """
    output = []  # Initialize our accumulator

    # This is how to check if something is None
    # (Pythonistas don't use == for None)
    if fname is None:
        # Fill in fname using a visual dialog window
        fname = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")],
                                           title="Choose an input file")

    # About open(), see section 9.1 "Reading word lists" of the text.
    # "with" makes sure file opening and closing is cleanly done.
    # The 'r' means one can only read the file, not change it.
    with open(fname, mode='r', encoding='utf-8') as fp:
        # See Section 13.3 "Word histogram" on looping through a file's lines
        for line in fp:
            left_justified_line = line.lstrip()  # Remove leading whitespace
            if len(left_justified_line) == 0 or left_justified_line[0] != '#':
                # Either line was empty or it wasn't a comment.
                line = line.replace('-', ' ')
                words_in_line = line.split()
                for ind in range(len(words_in_line)):
                    # Iterate over indices to keep track of which position in
                    # words_in_line to be replaced.
                    word = words_in_line[ind]
                    word = word.strip(string.punctuation + string.whitespace)
                    word = word.lower()
                    words_in_line[ind] = word  # Replace with normalized version
                output.append(' '.join(words_in_line) + '\n')
    return output


def convert_lines_to_string(linelist):
    """
    Returns: a single string that is the concatenation of every non-empty
    line in linelist except each such line has been stripped of leading and
    trailing whitespace (including newlines), and there is a single space
    between what used to be adjacent lines (ignoring lines that were
    originally empty strings before the stripping of whitespace).

    Example input and output:
        ["hi\\n", "there"] -> "hi there"
        [" hola  ", "  salut \\n howdy "] -> "hola salut \\n howdy"
        ["so", "la", "", "do"] -> "so la do"
        ["so", "\\n", "la", "", "", "do"] -> "so  la do"
            ** note the two spaces between "so" and "la"

    Precondition: linelist is a (possibly empty) list of strings.
    """
    # STUDENTS: Complete this implementation so that it satisfies its
    # specification, with the following constraints.
    # You MUST make effective use of a for-loop whose header is
    #     for line in linelist:
    # (We are testing whether you can work with loops directly over items in
    # a given list.)
    #
    # Implementations that use the join() string method and/or `map`
    # will not receive credit.
    #
    # Hint: dealing with lines that are the empty string can be tricky; watch
    # out that you don't get extra spaces.

    # Our lives would be dramatically eased if only there weren't
    # any empty lines to handle ... so let's get rid of them in a first pass!
    # (Thus, we see the power of wishful thinking.)
    non_empties = []  # Create a list of just the non-empty lines
    for line in linelist:
        if len(line) > 0:
            non_empties.append(line)

    if len(non_empties) == 0:
        return ''  # There were no non-empty lines

    # If we get here, non_empties has at least one item
    output = non_empties[0].strip()  # Add the first line as is
    for line in non_empties[1:]:
        # Subsequent lines are preceded by a space
        output += (' ' + line.strip())
    return output


def convert_lines_to_string2(linelist):
    """Same specification as convert_lines_to_string()"""
    # STUDENTS: Complete this implementation so that it satisfies its
    # specification, with the same implementation requirements
    # as for convert_lines_to_string(), EXCEPT:
    #
    # You MUST make effective use of a for-loop whose header is either
    #     for ind in range(len(linelist)):
    # or
    #     for ind in list(range(len(linelist))):
    # (We are testing whether you can work with loops over the indices of
    # a given list.)
    # Implementations that use the join() string method and/or `map`
    # will not receive credit.
    non_empties = []  # Create a list of just the non-empty lines
    for ind in list(range(len(linelist))):
        line = linelist[ind]
        if len(line) > 0:
            non_empties.append(line)

    if len(non_empties) == 0:
        return ''  # There were no non-empty lines

    # non_empties contains at least one item
    output = non_empties[0].strip()  # Put the first line in
    for ind in range(1, len(non_empties)):
        # Subsequent lines are preceded by a space
        item = non_empties[ind]
        output += (' ' + item.strip())
    return output


# Helper function
def _not_empty_line(line):
    """Returns True if len(line) > 0, False otherwise.
    Precondition: line is a string."""
    return len(line) > 0


def convert_lines_to_string3(linelist):
    """Same specification as convert_lines_to_string()"""
    # This implementation uses filter with a helper
    non_empties = list(filter(_not_empty_line, linelist))
    if len(non_empties) == 0:
        return ''
    output = non_empties[0].strip()
    for line in non_empties[1:]:
        output += (' ' + line.strip())
    return output


def convert_lines_to_string4(linelist):
    """Same specification as convert_lines_to_string()"""
    # This implementation uses filter with a lambda expression
    non_empties = list(filter(lambda x: len(x) > 0, linelist))
    if len(non_empties) == 0:
        return ''
    output = non_empties[0].strip()
    for line in non_empties[1:]:
        output += (' ' + line.strip())
    return output


def convert_lines_to_paragraphs(linelist):
    """
    Returns: a list of the paragraph-strings corresponding to the paragraphs
    in linelist.

    Each paragraph in linelist is a maximal contiguous subsequence of lines
    in linelist such that the sequence does not contain a blank line.
    A blank line is exactly the string "\\n".
    A paragraph-string is the result of running convert_lines_to_string()
    or convert_lines_to_string2() on a paragraph.

    If linelist is empty, or if all the lines in linelist are empty,
    returns the empty list.

    See the test cases in a3test.test_convert_lines_to_paragraph() for
    examples.

    Precondition: linelist is a (possibly empty) list of strings.
    """
    # STUDENTS: Complete this implementation so that it satisfies its
    # specification, with the following constraints.
    # You MUST make effective use of a for-loop.
    # You must decide for yourself whether it is better to loop over linelist
    # or over the indices of linelist, or whether it matters.
    # (We have solutions for either strategy, but one seemed a little trickier
    # than the other.)
    #
    # There are a lot of subtleties here.
    # First, take a close look at the test cases we have given you in
    # a3text.test_convert_lines_to_paragraph().
    # Notice that you need to handle cases in which there are multiple
    # consecutive blank lines.
    # Try figuring out what you as a human do to get the right answer on
    # all the test cases, before you try to implement your strategy in Python.
    # If you don't understand a test input or test output, ASK SOMEONE
    # BEFORE CODING!
    #
    # Suggested strategy: you know you want to build up a list that consists of
    # strings, so it seems that an accumulator variable that you add/append to
    # makes sense. But each of those strings is created by merging together a
    # set of lines. So you might have another variable that stores the lines in
    # the current paragraph (as defined above). When the current paragraph is
    # done, run convert_lines_to_string() on that current-paragraph list, and
    # then reset that current-paragraph list to get ready for the next
    # paragraph.
    if linelist == []:
        return []

    output = []
    curr_stanza_lines = []  # We'll want to run convert_l._t._s. on this
    for line in linelist:
        if line == '\n' and len(curr_stanza_lines) > 0:
            # We have a stanza containing content to add
            output.append(convert_lines_to_string(curr_stanza_lines))
            # Reset the current stanza's lines
            curr_stanza_lines = []
        elif line != '\n':
            curr_stanza_lines.append(line)

    # Handle the last stanza, if any
    if curr_stanza_lines != []:
        output.append(convert_lines_to_string(curr_stanza_lines))
    return output


# An alternate solution
def convert_lines_to_paragraphs2(linelist):
    """Same specification as above."""
    if len(linelist) == 0:
        return []

    output = []
    i_stanza_start = 0  # Keep track of index of start of potential stanza
    for ind in range(len(linelist)):
        if linelist[ind] == '\n':
            # Hit the end of the current stanza (if there is one)
            if ind == i_stanza_start:
                # The current stanza is empty; the next one must start later
                i_stanza_start = ind + 1
            else:
                stanza = linelist[i_stanza_start:ind]
                output.append(convert_lines_to_string(stanza))
                i_stanza_start = ind + 1

    # Handle the last stanza, if any
    if i_stanza_start != len(linelist):
        output.append(convert_lines_to_string(linelist[i_stanza_start:]))
    return output


def convert_lines_to_paragraphs3(linelist):
    """Same specification as convert_lines_to_paragraphs()."""
    if linelist == []:
        return []

    output = ['']
    # Idea: string for the current paragraph will always be last
    # item in output
    for ind in list(range(len(linelist))):
        line = linelist[ind]
        if line != '\n':
            # Add this line to current last entry in output
            if output[-1] != '':
                output[-1] += ' '  # need to add separator space
            output[-1] += convert_lines_to_string([line])
        else:
            # We are at a newline.
            if ind == 0 or linelist[ind - 1] == '\n':
                # First line was blank or in a sequence of blank lines
                pass
            else:
                # We are not in a sequence of blank lines, and there was
                # prior content
                output.append('')

    if output[-1] == '' and linelist[-1] == '\n':
        # The anticipated next paragraph never happened
        output.pop()
    return output


# STUDENTS: we ran this function to provide you a local version of the text
# of the relevant webpages --- to speed things up for you (you don't have
# to wait for webserver responses), and to keep the load light on the webserver
# (we don't want to have 500+ students hitting The Economist's webserver over
# and over again).
def download_econ_vocab_data(fname):
    """(over)write into file fname the concatenation of text regarding
    economics-related terminology text from
        https://www.economist.com/economics-a-to-z/a
        https://www.economist.com/economics-a-to-z/b
        ...
        https://www.economist.com/economics-a-to-z/z

    Precondition: directory econ_dict is in the same directory as this file.
    """
    with open(fname, mode='a+', encoding='utf-8') as fp:
        # Isn't it handy to be able to loop through strings?
        for letter in string.ascii_lowercase:
            # You can check that this is like what we did in A1, file
            # get_status_from_webpage.py
            data_name = 'https://www.economist.com/economics-a-to-z/'
            try:
                data_source = urllib.request.urlopen(data_name + letter)
                fp.write(data_source.read().decode('utf-8'))
                fp.write('\n\n')  # have a separator between webpages
            except ValueError:
                print("Something is wrong with the web address or webpage.")
                sys.exit()


def get_econ_vocab_helper(vlist_to_add_to, work_text):
    """Extends vlist_to_add_to with the list of the vocabulary items,
    lower-cased, in work_text, assumed to be well-formatted html"""
    # Add each <strong>-tagged term or phrase to vlist_to_add_to, separating
    # out comma-separated phrases.
    # NOTE(review): the '<strong>'/'</strong>' literals below were destroyed
    # by text extraction in the source we reviewed; they are reconstructed
    # from the surviving error message and len('') remnants -- confirm
    # against the original file.
    i_start = work_text.find('<strong>')
    if i_start == -1:
        # No more <strong> tags, all done
        return
    else:
        work_text = work_text[i_start + len('<strong>'):]
        try:
            i_end = work_text.index('</strong>')
            # If no matching </strong>,
            # quit because the data is corrupt
        except ValueError:
            print('Data has an <strong> without matching </strong>.')
            sys.exit()
        # Deal with "G7, G8, G20"
        term_list = work_text[:i_end].split(',')
        for term in term_list:
            vlist_to_add_to.append(term.lower().strip())
        work_text = work_text[i_end + len('</strong>') + 1:]
        # Recursive call!  (Fine for this data's size; very large inputs
        # would need an iterative version to avoid the recursion limit.)
        get_econ_vocab_helper(vlist_to_add_to, work_text)


def get_econ_vocab(fname=None):
    """Returns a list of the vocabulary items in fname, lower-cased."""
    if fname is None:
        # This is a file on Prof. Lee's computer.
        fname = ECON_DATA_FNAME
    with open(fname, mode='r', encoding='utf-8') as fp:
        outlist = []
        get_econ_vocab_helper(outlist, fp.read())
    return outlist


def track_topic(docs_list, vocab_list):
    """
    Returns: a list of the fraction of words in each document in docs_list
    that are in vocab_list.  In more detail...

    Preconditions:
    * docs_list: a (possibly empty) list of nonempty strings, each of which
      contains at least one non-white-space character.
      We consider each item of docs_list to be a "document" where the "words"
      of the document are all the spans of characters that don't contain
      whitespace.  No "words" contain beginning or ending punctuation,
      although internal punctuation is OK.
      So, this document
          hey howdy how's the weather
      has five words.  This document
          xxx y z3!42
      has three words.  This document
          I have eaten\\nthe plums\\nthat were in\\nthe icebox\\n\\nand which\\n
      has 12 words.  This is NOT a legal document:
          hey howdy, how's the weather???
    * vocab_list is a non-empty list of non-empty strings that may contain
      spaces.  We consider each item of vocab_list to be a target "word".
      No target word can have beginning or ending punctuation, although
      internal punctuation is OK.

    This function returns a new list outlist such that:
    * len(outlist) == len(docs_list), and
    * for each valid index `ind` of docs_list, outlist[ind] is the fraction
      of words in document docs_list[ind] that are found in vocab_list.
      The fraction should be a float rounded to three digits past the decimal
      point via the round() built-in function.

    Examples: if doclist[0] is "abc abcabc a a" and vocab_list is ["abc"],
    then outlist[0] should be .25 (i.e, 1/4).
    If doclist[1] is "abc abcabc a a" and vocab_list is ["ABC", "a"], then
    outlist[1] should be .5 (i.e., 2/4)
    If doclist[2] is "ab abab a a" and vocab_list is
    ["ABC", "a", "ab", "v", "abab"], then outlist[2] should be 1.0
    (i.e., 4/4).

    The reason we disallow punctuation is to avoid having to decide whether a
    document "are you okay?" contains a word in the list ["okay"].
    """
    # STUDENTS: Complete this implementation so that it satisfies its
    # specification, with the following constraints.
    # You MUST make effective use of a for-loop, and you might need to use a
    # nested for-loop.
    #
    # Hint: If you are counting how many vocab_list words occur in a given
    # document, don't forget to reset that count every time you start with a
    # new document.
    #
    # Hint: we found the string method split() to be quite useful.  For a
    # (kind of unconventional) example of using split to get a convenient list
    # of words, see the file
    # http://www.cs.cornell.edu/courses/cs1110/2018sp/lectures/lecture12/modules/madlibs2.py
    outlist = []
    for doc in docs_list:
        count = 0  # Number of vocab_list words found so far in current doc
        doc_words = doc.split()
        for word in doc_words:
            if word in vocab_list:
                count += 1
        outlist.append(round(count / len(doc_words), 3))
    return outlist


# See the second half of section 14.9 "Writing modules" of the text for more
# on this __name__ business.  Basic idea: the indented code is only run if
# this file is run by `python a3.py` on the command line
if __name__ == '__main__':
    # econ_vocab = get_econ_vocab()
    # print(econ_vocab)

    # https://www.exaptive.com/blog/topic-modeling-the-state-of-the-union
    red_topic = ['make sure', 'company', 'college', 'republican', 'parent',
                 'medicare', 'bipartisan', 'kid', 'small business', 'global']
    purple_topic = ['afghanistan', 'america', 'terror', 'troop', 'border',
                    'terrorist', 'violence', 'enemy', 'fighting', 'rule']

    print("Demonstration of tracing a topic through a single speech: ")
    print("How the red topic trends through Obama's 2013 SOTU.")
    print(" (Topics typically exhibit such `bursty' behavior.)")
    fname = os.path.join('sources', '2013_obama.txt')
    obama13 = convert_lines_to_paragraphs(get_content_lines(fname))
    print(track_topic(obama13, red_topic))

    sotus = []  # State of the Union addresses, 2001-2018
    for year in range(2001, 2009):
        fname = os.path.join('sources', str(year) + '_bush.txt')
        sotus.append(convert_lines_to_string(get_content_lines(fname)))
    for year in range(2009, 2017):
        fname = os.path.join('sources', str(year) + '_obama.txt')
        sotus.append(convert_lines_to_string(get_content_lines(fname)))
    for year in range(2017, 2019):
        fname = os.path.join('sources', str(year) + '_trump.txt')
        sotus.append(convert_lines_to_string(get_content_lines(fname)))

    # Compute each trend once and reuse it for plotting (the reviewed version
    # computed these and then redundantly recomputed them in the plot calls).
    red_trend = track_topic(sotus, red_topic)
    purple_trend = track_topic(sotus, purple_topic)

    import matplotlib.pyplot as plt
    plt.title("Topic trends in recent US State of the Union addresses")
    plt.ylabel("fraction of speech tokens on the topic")
    x = list(range(2001, 2019))
    plt.plot(x, track_topic(sotus, ECON_VOCAB), 'b', marker='o',
             label="The Economist's economic terms")
    plt.plot(x, red_trend, 'r', marker='o',
             label="Evans' red topic (selections: 'college', 'parent', ...)")
    plt.plot(x, purple_trend, 'purple', marker='o',
             label="Evans' purple topic (selections: 'terrorist', 'enemy', ...)")
    labels = ["2001: Bush", "2005: Bush", "2009: Obama", "2013: Obama",
              "2017: Trump"]
    plt.xticks(range(2001, 2019, 4), labels, rotation=45, fontsize=6)
    plt.legend()
    plt.show()
    plt.close('all')