# a5.py # YOUR NAME(S) AND NETID(S) HERE # DATE COMPLETED HERE """Module for K-Means Clustering WRITE YOUR ESSAY HERE""" import random import numpy import math # HELPER FUNCTION FOR PART D def random_range(a,b,size): """Returns: A list of length size, with ints between a and b. The list is guaranteed to contain no duplicates, but is otherwise random. This is a helper function to make it easy to pick a list of distinct rows from a table. Example: random_range(1,5,3) => [2, 1, 4] Precondition: a, b, size are ints. b-a > size.""" assert type(a) == int, 'a='+`a`+' is not an int' assert type(b) == int, 'b='+`b`+' is not an int' assert type(size) == int, 'size='+`size`+' is not an int' assert b-a > size, 'size '+`size`+' exceeds range '+`a`+' to '+`b` result = [] while (len(result) < size): pick = random.randint(a,b-1) # Linear search if there is a collision. while pick in result: pick = (pick+1) % (b-a) result.append(pick) return result # HELPER FUNCTIONS FOR ASSERTS GO HERE # PART I: Classes class Database(object): """Instance is a database for KMeans clustering. The data is stored as a list of list of numbers (e.g. a 2d list). Each row is a data point. The contents of each point may be either ints or floats. Instance Attributes: _dimension: the point dimension for this database [int > 0. Value NEVER changes after initialization] _ksize: the k value for k-means clustering [int >= 0] _contents: the database contents [either a 2D list of numbers (int or float) or empty list, the number of columns (len(_contents[0]) is _dimension] _clusters: the Cluster objects for the k-means algorithm [list of Clusters or empty list, len(_clusters) = _ksize] None of these attributes should be accessed directly outside of a class Cluster. Instead, this class has getter and setter style methods (with the appropriate preconditions) for modifying these values.""" # PART A (MUST MODIFY THIS METHOD HEADER) def __init__(self,dim,contents): """Initializer: Makes a database for the given point dimension. The parameter dim is the initial value for attribute _dimension. The optional parameter contents is the initial value of the attribute _contents. When assigning contents to the attribute _contents it COPIES the list contents. If contents is None, the initializer assigns _contents an empty list. The parameter contents is None by default. The initial value of the attribute _ksize is 0. The also means that the intial value of the attribute _clusters is the empty list. Precondition: dim is an int > 0. contents is either None or it is a 2D list of numbers (int or float). If contents is not None, then contents is not empty and the number of columns of contents is equal to dim.""" # IMPLEMENT ME pass def getDimension(self): """Return: The point dimension of this database""" # IMPLEMENT ME pass def getKSize(self): """Return: The k value for k-means clustering""" # IMPLEMENT ME pass def getContents(self): """Returns: The database contents""" # IMPLEMENT ME pass def appendContents(self,point): """Adds a COPY of point at the end of _contents. This method does not add the point directly. It adds a copy of the point. Precondition: point is a list of numbers (int or float), len(point) = _dimension.""" # IMPLEMENT ME pass def clearContents(self): """Deletes all the database contents, making _contents an empty list.""" # IMPLEMENT ME pass def getCluster(self,pos): """Returns: The cluster at position pos in _clusters Recall that _clusters is a list. The proper way to construct a getter is one that takes a position as argument and returns the cluster at that position. Precondition: pos is an int in 0..k-1, where k is the k-value""" # IMPLEMENT ME pass # Part C def nearest(self,point): """Returns: Cluster nearest to this point This method uses the distance method in each Cluster to compute the distance between point and the cluster centroid. It returns the Cluster that is the closest. If _ksize is 0 (there are no clusters), it returns None. Precondition: point is a list of numbers (int or float), len(point) = _dimension.""" # IMPLEMENT ME pass def partition(self): """Repartitions the database so each point is in exactly one Cluster. This method clears all of the Clusters. Then, for each point in the database, this method 1. Finds the nearest cluster. 2. Adds that point to the cluster. Precondition: The attribute _ksize is not 0 (Note this is a precondition on an attribute, not a parameter).""" # IMPLEMENT ME pass # PART D def setKSize(self,k): """Updates the clusters and sets _ksize to k. This method initializes the database to use the correct k. If the database already has Clusters (because _ksize > 0), then it removes the old clusters and replaces them with new ones. This method should set the _ksize attribute to k. If k is 0, then _clusters should be set to an empty list. If k > 0, this method partitions the database contents into the new contents as its final step. Precondition: k is an int >= 0.""" # IMPLEMENT ME pass # PART E def step(self): """Return: True if the process converges after a single k-means step. This method processes a single step in k-means clustering. It updates each Cluster to compute a new Centroid (via the update() method). Using these new centroids, it repartitions the database contents so that each point is in the appropriate cluster. This method returns true if the update is stable (see the update() method specification in class Cluster). It returns False otherwise.""" # IMPLEMENT ME pass class Cluster(object): """Instance represents a cluster of data in k-means clustering. Instance Attributes: _centroid: the cluster centroid [list of number (int or float)] _dimension: the point dimension of this cluster [int > 0. len(_centroid) = _dimension Value NEVER changes after initialization] _contents: the current data in the cluster [either a 2D list of numbers (int or float) or empty list, the number of columns (len(_contents[0]) is _dimension] _srating: the sum of all ratings (for market analysis) [float >= 0] _crating: the number of ratings (for market analysis) [float >= 0] None of these attributes should be accessed directly outside of a class Cluster. Instead, this class has getter and setter style methods (with the appropriate preconditions) for modifying these values.""" # PART B def __init__(self,centroid): """Initializer: Makes the cluster with given centroid. The parameter centroid is the initial value for attribute _centroid. The initializer COPIES the centroid before assigning it to the attribute _centroid. The _dimension attribute set to the length of centroid. The _contents attribute is initialize to an empty list. The attributes _srating and _crating are initialized to 0. Precondition: centroid is a list of numbers (int or float)""" # IMPLEMENT ME pass def getDimension(self): """Return: The point dimension of this cluster""" # IMPLEMENT ME pass def getCentroid(self): """Returns: The cluster centroid""" # IMPLEMENT ME pass def setCentroid(self,point): """Changes the value of the cluster centroid. This method COPIES point when it assigns it as the new centroid. Precondition: point is a list of numbers with length _dimension""" # IMPLEMENT ME pass def getContents(self): """Returns: The current data in the cluster.""" # IMPLEMENT ME pass def appendContents(self,point): """Adds a COPY of point at the end of _contents. This method does not add the point directly. It adds a copy of the point. Precondition: point is a list of numbers (int or float), len(point) = _dimension.""" # IMPLEMENT ME pass def clearContents(self): """Deletes all the cluster contents, making _contents an empty list.""" # IMPLEMENT ME pass # Part C def distance(self,point): """Returns: Distance from point to _centroid. This method uses Euclidean distance formula for the calculation. See http://en.wikipedia.org/wiki/Euclidean_distance Precondition: point is a list of numbers (int or float), len(point) = _dimension.""" # IMPLEMENT ME pass # Part E def update(self): """Returns: Trues if the centroid remains the same after recomputation, and False otherwise. This method recomputes the _centroid attribute of this cluster. The new _centroid attribute is the average of the points of _contents (To average a point, average each coordinate separately). If the centroid remains unchanged after recomputation, the process has stabilized, and so this method returns True. Otherwise it returns False. If _contents are empty, the _centroid attribute is reset to the origin (the coordinates are all 0's). While there are other ways to handle empty clusters (randomly picking a new point), it requires tighter cooridination with the Database object. This solution allows us to simplify the code a lot.""" # IMPLEMENT ME pass # PART F: def getRating(self): """Returns: Rating of this cluster. The rating is an average of all of the ratings assigned to this cluster (e.g. it is the value _srating/_crating). If no ratings have yet been assigned (_crating is 0), this this method returns None.""" # IMPLEMENT ME pass def addRating(self,rating): """Adds a rating to this cluster. The rating is averaged with all of those added so far. To do this, it adds the rating to _srating and increments _crating by 1. Precondition: rating is a float between 0 and 5 (inclusive)""" # IMPLEMENT ME pass def clearRating(self): """Clears the rating for this cluster. Both _srating and _crating are set to 0.""" # IMPLEMENT ME pass # PART II: Analysis # HELPER FUNCTIONS SHOULD GO HERE # ANALYSIS FUNCTION def best_cluster(filename,k,maxstep,cutoff): """Returns: Cluster object with highest rating after clustering. The function opens and reads from the "candy file" filename. Each line of this file has a candy name, sweetness, sourness, nuttiness, texture, and a popularity rating. The values sweetness, sourness, nuttiness, and texture are between 0 and 1. Ratings are between 0 and 5. The function selects only those candies whose rating > cutoff and constructs a Database from them. DO NOT INCLUDE THE RATING WHEN ADDING A CANDY TO THE DATABASE. If there are no candies above the cutoff, this function should immediately stop and return None. The function then runs k-means clustering for the given k value. It runs until the clusters converge (step() returns true) or until maxsteps have passed, whichever comes first. The function then computes the rating for each cluster. It uses the addRating()/clearRating() methods in Cluster to add each candy rating. When computing a rating for each cluster, it considers ALL candies, not just those with ratings above a cut-off. See the instructions for more information. When done, this function returns the Cluster object (use getCluster in the Database object) with the highest rating. Precondition: filename a string. k, maxstep > 0 are ints. cutoff is either an int or a float between 0 and 5 (inclusive)""" # YOU GET NO MORE THAN 30 LINES STARTING NOW (including asserts) pass