'''Word, sentence and readability statistics for a text file.

This file was modified 10-08-09 at approximately 6:30pm.

The code runs unchanged on Python 2.6+ and Python 3: every print call passes
a single, pre-formatted string, and interactive input goes through _prompt.
'''

import sys


def _prompt(message):
    '''Show message and return the line typed by the user, without the
    trailing newline. Uses raw_input on Python 2 and input on Python 3.'''

    try:
        reader = raw_input  # Python 2
    except NameError:
        reader = input  # Python 3
    return reader(message)


def _report(label, value):
    '''Print label and value on one line, separated by a single space (the
    same layout the statement "print label, value" produced).'''

    print("%s %s" % (label, value))


def _ratio(numerator, denominator):
    '''Return numerator / denominator as a float, or None when denominator
    is zero and the ratio is therefore undefined.'''

    if denominator == 0:
        return None
    return numerator / float(denominator)


def _displayable(value):
    '''Return value, or the string "n/a" when value is None.'''

    return "n/a" if value is None else value


def vocabulary(word_and_punctuation_instances, punctuation):
    '''word_and_punctuation_instances is a list of strings; punctuation is a
    list of punctuation characters. Return a new list holding every distinct
    element of word_and_punctuation_instances that is not in punctuation.
    The result contains no duplicates and keeps the elements in the order in
    which they first appear.'''

    seen = set()  # constant-time duplicate check; 'distinct' keeps the order
    distinct = []
    for word in word_and_punctuation_instances:
        # 'punctuation' is tested with plain membership so that any container
        # the caller passes keeps behaving exactly as before.
        if word in seen or word in punctuation:
            continue
        seen.add(word)
        distinct.append(word)
    return distinct


def analyze_vocabulary(word_and_punctuation_instances, punctuation):
    '''Print the number of distinct words in word_and_punctuation_instances,
    followed by a sorted list of those words. "punctuation" is a list of
    punctuation characters; they may occur in
    word_and_punctuation_instances but are not treated as words.'''

    distinct = sorted(vocabulary(word_and_punctuation_instances, punctuation))
    _report("The number of distinct words is: ", len(distinct))
    _report("And the word list is: ", distinct)


def print_stats(word_and_punctuation_instances, within_sentence_punctuation,
                end_sentence_punctuation):
    '''"word_and_punctuation_instances" is a list of strings;
    "within_sentence_punctuation" and "end_sentence_punctuation" are lists of
    punctuation characters that can occur in the middle of a sentence, or at
    the end of a sentence, respectively.

    Print the number of distinct words, the number of words, the average
    word length, the number of sentences, the average sentence length and
    the "ARI" score (presumably the Automated Readability Index), computed as
    4.71 * (average word length) + 0.5 * (average sentence length) - 21.43.

    A sentence is counted for every element that is in
    end_sentence_punctuation. Any statistic whose denominator is zero (no
    words, or no sentence-ending punctuation) is printed as "n/a".'''

    # Built once; both the vocabulary and the word filter need it.
    all_punctuation = within_sentence_punctuation + end_sentence_punctuation

    distinct_count = len(vocabulary(word_and_punctuation_instances,
                                    all_punctuation))

    # The words are all the elements that are not punctuation.
    words = [wp for wp in word_and_punctuation_instances
             if wp not in all_punctuation]
    total_word_length = sum(len(w) for w in words)

    # len(words) is already the numerator of the average sentence length, so
    # only the number of sentences has to be counted here.
    sentence_count = sum(1 for wp in word_and_punctuation_instances
                         if wp in end_sentence_punctuation)

    ave_word_length = _ratio(total_word_length, len(words))
    ave_sentence_length = _ratio(len(words), sentence_count)
    if ave_word_length is None or ave_sentence_length is None:
        ari = None  # undefined without at least one word and one sentence
    else:
        ari = 4.71 * ave_word_length + 0.5 * ave_sentence_length - 21.43

    _report("Number of distinct words: ", distinct_count)
    _report("Number of words: ", len(words))
    _report("Average word length: ", _displayable(ave_word_length))
    _report("Number of sentences: ", sentence_count)
    _report("Average sentence length: ", _displayable(ave_sentence_length))
    _report("ARI: ", _displayable(ari))


def separate_punctuation(strings, punctuation_list):
    '''strings is a list of strings. "punctuation_list" is a list of
    punctuation characters. Return a new list that is the same as strings,
    except that every string ending in a punctuation character is split into
    the string without that final character, followed by the character.

    Only the last character is split off. A string that consists of a single
    punctuation character yields just that character (no empty string is
    added), and an empty string is copied through unchanged.'''

    separated = []
    for s in strings:
        # The 's and' guard keeps an empty string from raising IndexError.
        if s and s[-1] in punctuation_list:
            stem = s[:-1]
            if stem:
                # Skip the empty stem of a lone punctuation mark so it is
                # never counted as a zero-length word.
                separated.append(stem)
            separated.append(s[-1])
        else:
            separated.append(s)
    return separated


def print_one_line(list):
    '''Print the strings in 'list' on a single line, with ", " separating
    them, and end the line.'''

    # The parameter name shadows the builtin but is kept for callers that
    # pass it by keyword.
    print(", ".join(list))


def get_valid_command(valid):
    '''Prompt for and return a string that is a valid command, i.e., a
    string in the list "valid". Print "Invalid command" and prompt again
    for as long as the user enters anything else.'''

    command = _prompt("Please enter a command: ")
    while command not in valid:
        print("Invalid command")
        command = _prompt("Please enter a command: ")
    return command


def main():
    '''Ask for a file name and a command, then print either the statistics
    ("stats") or the vocabulary ("vocab") of that file.'''

    # Get the name of a file to analyze, and put its contents into a
    # list of whitespace-separated strings.
    filename = _prompt("File to analyze: ")
    with open(filename, 'r') as f:  # closed even if reading fails
        contents = f.read().split()
    print("Raw contents, for debugging - don't print this out in your final solution")
    print(contents)

    # inner_punct is a list of within-sentence punctuation.
    # end_punct is a list of end-of-sentence punctuation.
    # punct is a list of all punctuation characters.
    inner_punct = [",", ";", "-"]
    end_punct = ["!", "?", "."]
    punct = inner_punct + end_punct

    # Separate any punctuation that is attached to the end of words.
    contents = separate_punctuation(contents, punct)
    print("Contents after separating out punctuation, for debugging - don't print this out in your final solution")
    print(contents)

    # Tell the user what the valid commands are, then
    # read and process the user's command.
    valid_commands = ["stats", "vocab"]
    # Written without a newline so the command list follows on the same line.
    sys.stdout.write("The valid commands are: ")
    print_one_line(valid_commands)

    command = get_valid_command(valid_commands)
    if command == "stats":
        print_stats(contents, inner_punct, end_punct)
    elif command == "vocab":
        analyze_vocabulary(contents, punct)


if __name__ == "__main__":
    main()