# a5.py
# YOUR NAME(S) AND NETID(S) HERE
# DATE COMPLETED HERE
"""Module for K-Means Clustering

WRITE YOUR ESSAY HERE"""
import random
import numpy
import math

# HELPER FUNCTION FOR PART D
def random_range(a,b,size):
    """Returns: A list of length size, with ints between a and b.
    
    The list is guaranteed to contain no duplicates, but is otherwise
    random. This is a helper function to make it easy to pick a list 
    of distinct rows from a table.

    Example: random_range(1,5,3) => [2, 1, 4]
    
    Precondition: a, b, size are ints.  b-a > size."""
    assert type(a) == int, 'a='+`a`+' is not an int'
    assert type(b) == int, 'b='+`b`+' is not an int'
    assert type(size) == int, 'size='+`size`+' is not an int'
    assert b-a > size, 'size '+`size`+' exceeds range '+`a`+' to '+`b`
    
    result = []
    while (len(result) < size):
        pick = random.randint(a,b-1)
        # Linear search if there is a collision.
        while pick in result:
            pick = (pick+1) % (b-a)
        result.append(pick)
    return result


# HELPER FUNCTIONS FOR ASSERTS GO HERE


# PART I: Classes
class Database(object):
    """Instance is a database for KMeans clustering.
    
    The data is stored as a list of list of numbers 
    (e.g. a 2d list).  Each row is a data point. The contents of
    each point may be either ints or floats.
    
    Instance Attributes:
        _dimension: the point dimension for this database 
                    [int > 0. Value NEVER changes after initialization]            
        _ksize:     the k value for k-means clustering [int >= 0]
        _contents:  the database contents
                    [either a 2D list of numbers (int or float) or empty list,
                     the number of columns (len(_contents[0]) is _dimension]
        _clusters:  the Cluster objects for the k-means algorithm
                    [list of Clusters or empty list, len(_clusters) = _ksize]

    None of these attributes should be accessed directly outside
    of a class Cluster.  Instead, this class has getter and setter
    style methods (with the appropriate preconditions) for modifying
    these values."""
    
    # PART A (MUST MODIFY THIS METHOD HEADER)
    def __init__(self,dim,contents):
        """Initializer: Makes a database for the given point dimension.
    
        The parameter dim is the initial value for attribute _dimension.  
  
        The optional parameter contents is the initial value of the
        attribute _contents. When assigning contents to the attribute
        _contents it COPIES the list contents. If contents is None, the 
        initializer assigns _contents an empty list. The parameter contents 
        is None by default.
        
        The initial value of the attribute _ksize is 0. The also means that
        the intial value of the attribute _clusters is the empty list.
        
        Precondition: dim is an int > 0. contents is either None or 
        it is a 2D list of numbers (int or float). If contents is not None, 
        then contents is not empty and the number of columns of contents is 
        equal to dim."""
        # IMPLEMENT ME
        pass

    def getDimension(self):
        """Return: The point dimension of this database"""
        # IMPLEMENT ME
        pass
    
    def getKSize(self):
        """Return: The k value for k-means clustering"""
        # IMPLEMENT ME
        pass

    def getContents(self):
        """Returns: The database contents"""
        # IMPLEMENT ME
        pass

    def appendContents(self,point):
        """Adds a COPY of point at the end of _contents.
        
        This method does not add the point directly. It adds
        a copy of the point.

        Precondition: point is a list of numbers (int or float), 
        len(point) = _dimension."""
        # IMPLEMENT ME
        pass
        
    def clearContents(self):
        """Deletes all the database contents, making _contents an empty list."""
        # IMPLEMENT ME
        pass 

    def getCluster(self,pos):
        """Returns: The cluster at position pos in _clusters
        
        Recall that _clusters is a list. The proper way to construct
        a getter is one that takes a position as argument and returns 
        the cluster at that position.
        
        Precondition: pos is an int in 0..k-1, where k is the k-value"""
        # IMPLEMENT ME
        pass
        
    
    # Part C
    def nearest(self,point):
        """Returns: Cluster nearest to this point
       
        This method uses the distance method in each Cluster to compute 
        the distance between point and the cluster centroid. It returns 
        the Cluster that is the closest.
        
        If _ksize is 0 (there are no clusters), it returns None.
       
        Precondition: point is a list of numbers (int or float), 
        len(point) = _dimension."""
        # IMPLEMENT ME
        pass       
    
    def partition(self):
        """Repartitions the database so each point is in exactly one Cluster.

        This method clears all of the Clusters.  Then, for each point in the 
        database, this method
          1. Finds the nearest cluster.
          2. Adds that point to the cluster.
          
        Precondition: The attribute _ksize is not 0 (Note this is a 
        precondition on an attribute, not a parameter)."""
        # IMPLEMENT ME
        pass
            

    # PART D
    def setKSize(self,k):
        """Updates the clusters and sets _ksize to k.
        
        This method initializes the database to use the correct k.
        If the database already has Clusters (because _ksize > 0),
        then it removes the old clusters and replaces them with
        new ones.
        
        This method should set the _ksize attribute to k.

        If k is 0, then _clusters should be set to an empty list.
        
        If k > 0, this method partitions the database contents
        into the new contents as its final step.

        Precondition: k is an int >= 0."""
        # IMPLEMENT ME
        pass
    

    # PART E
    def step(self):
        """Return: True if the process converges after a single k-means step.
        
        This method processes a single step in k-means clustering.  It
        updates each Cluster to compute a new Centroid (via the update() 
        method).  Using these new centroids, it repartitions the database 
        contents so that each point is in the appropriate cluster.
        
        This method returns true if the update is stable (see the update() 
        method specification in class Cluster). It returns False otherwise."""
        # IMPLEMENT ME
        pass


class Cluster(object):
    """Instance represents a cluster of data in k-means clustering.
    
    Instance Attributes:
        _centroid:  the cluster centroid
                    [list of number (int or float)]
        _dimension: the point dimension of this cluster
                    [int > 0. len(_centroid) = _dimension
                     Value NEVER changes after initialization]
        _contents:  the current data in the cluster 
                    [either a 2D list of numbers (int or float) or empty list,
                     the number of columns (len(_contents[0]) is _dimension]

        _srating:   the sum of all ratings (for market analysis) 
                    [float >= 0]
        _crating:   the number of ratings (for market analysis)  
                    [float >= 0]

    None of these attributes should be accessed directly outside
    of a class Cluster.  Instead, this class has getter and setter
    style methods (with the appropriate preconditions) for modifying
    these values."""

    # PART B
    def __init__(self,centroid):
        """Initializer: Makes the cluster with given centroid.
        
        The parameter centroid is the initial value for attribute _centroid. 
        The initializer COPIES the centroid before assigning it to the
        attribute _centroid.

        The _dimension attribute set to the length of centroid. The
        _contents attribute is initialize to an empty list.
        
        The attributes _srating and _crating are initialized to 0.
        
        Precondition: centroid is a list of numbers (int or float)"""
        # IMPLEMENT ME
        pass
        
    def getDimension(self):
        """Return: The point dimension of this cluster"""
        # IMPLEMENT ME
        pass

    def getCentroid(self):
        """Returns: The cluster centroid"""
        # IMPLEMENT ME
        pass

    def setCentroid(self,point):
        """Changes the value of the cluster centroid.
        
        This method COPIES point when it assigns it as the new centroid.
        
        Precondition: point is a list of numbers with length _dimension"""
        # IMPLEMENT ME
        pass

    def getContents(self):
        """Returns: The current data in the cluster."""
        # IMPLEMENT ME
        pass

    def appendContents(self,point):
        """Adds a COPY of point at the end of _contents.
        
        This method does not add the point directly. It adds
        a copy of the point.
        
        Precondition: point is a list of numbers (int or float), 
        len(point) = _dimension."""
        # IMPLEMENT ME
        pass

    def clearContents(self):
        """Deletes all the cluster contents, making _contents an empty list."""
        # IMPLEMENT ME
        pass


    # Part C
    def distance(self,point):
        """Returns: Distance from point to _centroid.
        
        This method uses Euclidean distance formula for the calculation.
        See http://en.wikipedia.org/wiki/Euclidean_distance
        
        Precondition: point is a list of numbers (int or float), 
        len(point) = _dimension."""
        # IMPLEMENT ME
        pass


    # Part E
    def update(self):
        """Returns: Trues if the centroid remains the same after recomputation,
        and False otherwise.
        
        This method recomputes the _centroid attribute of this cluster. The
        new _centroid attribute is the average of the points of _contents
        (To average a point, average each coordinate separately).  
        
        If the centroid remains unchanged after recomputation, the process has 
        stabilized, and so this method returns True.  Otherwise it returns 
        False.
        
        If _contents are empty, the _centroid attribute is reset to the origin 
        (the coordinates are all 0's).  While there are other ways to handle 
        empty clusters (randomly picking a new point), it requires tighter 
        cooridination with the Database object.  This solution allows us to 
        simplify the code a lot."""
        # IMPLEMENT ME
        pass


    # PART F:    
    def getRating(self):
        """Returns: Rating of this cluster.
        
        The rating is an average of all of the ratings assigned to this
        cluster (e.g. it is the value _srating/_crating). If no ratings 
        have yet been assigned (_crating is 0), this this method returns 
        None."""
        # IMPLEMENT ME
        pass
    
    def addRating(self,rating):
        """Adds a rating to this cluster.
        
        The rating is averaged with all of those added so far.
        To do this, it adds the rating to _srating and increments
        _crating by 1.
        
        Precondition: rating is a float between 0 and 5 (inclusive)"""
        # IMPLEMENT ME
        pass

    def clearRating(self):
        """Clears the rating for this cluster.
        
        Both _srating and _crating are set to 0."""
        # IMPLEMENT ME
        pass

    
# PART II: Analysis

# HELPER FUNCTIONS SHOULD GO HERE  

    
# ANALYSIS FUNCTION
def best_cluster(filename,k,maxstep,cutoff):
    """Returns: Cluster object with highest rating after clustering.
    
    The function opens and reads from the "candy file" filename.
    Each line of this file has a candy name, sweetness, sourness,
    nuttiness, texture, and a popularity rating.  The values
    sweetness, sourness, nuttiness, and texture are between 0 and 1.
    Ratings are between 0 and 5.
    
    The function selects only those candies whose rating > cutoff
    and constructs a Database from them.  DO NOT INCLUDE THE
    RATING WHEN ADDING A CANDY TO THE DATABASE. If there are no
    candies above the cutoff, this function should immediately
    stop and return None.
    
    The function then runs k-means clustering for the given k value.  
    It runs until the clusters converge (step() returns true) or 
    until maxsteps have passed, whichever comes first.
    
    The function then computes the rating for each cluster.  It
    uses the addRating()/clearRating() methods in Cluster to 
    add each candy rating.  When computing a rating for each cluster, 
    it considers ALL candies, not just those with ratings above a cut-off.
    See the instructions for more information.
    
    When done, this function returns the Cluster object 
    (use getCluster in the Database object) with the highest rating.
    
    Precondition: filename a string. k, maxstep > 0 are ints.
    cutoff is either an int or a float between 0 and 5 (inclusive)"""
    # YOU GET NO MORE THAN 30 LINES STARTING NOW (including asserts)
    pass