scikit-learn · hierarchical-clustering · dbscan · hdbscan

Clustering for a single time series


I have a single NumPy array (x) and I want to cluster it in an unsupervised way using DBSCAN and hierarchical clustering with scikit-learn. Is clustering possible for single-array data? Additionally, I need to plot the clusters and their corresponding representation on the input data.

I tried

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats
import scipy.cluster.hierarchy as hac
# My data: a sine wave of amplitude 1.5 evaluated at 10000 evenly spaced
# points in [0, 500]. Note that x is a 1-D array of shape (10000,).

x = np.linspace(0, 500, 10000)
x = 1.5 * np.sin(x)
# DBSCAN
clustering = DBSCAN(eps=3).fit(x)
# This is where I am facing the problem.
# NOTE(review): x is 1-D here, while scikit-learn estimators expect a 2-D
# array of shape (n_samples, n_features) -- presumably the cause; confirm
# against the actual error message.
# Hierarchical clustering (no code yet)

Solution

  • Yes, DBSCAN can cluster "1-D" arrays once they are reshaped into a single column. A time-series example is also given further below, although I don't know the significance of clustering just the waveform.

    For example,

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.cluster import DBSCAN
    
    # Synthetic 1-D sample: 200 rows of five normal draws (unit scale) centred
    # on the values below, flattened into one column because scikit-learn
    # estimators take 2-D input of shape (n_samples, n_features).
    centres = [-10, 0, 0, 0, 10]
    rng = np.random.default_rng(42)
    x = rng.normal(loc=centres, size=(200, 5)).reshape(-1, 1)
    rng.shuffle(x)  # in-place shuffle of the rows
    print(x[:10])
    # [[-10.54349551]
    #  [ -0.32626201]
    #  [  0.22359555]
    #  [ -0.05841124]
    #  [ -0.11761086]
    #  [ -1.0824272 ]
    #  [  0.43476607]
    #  [ 11.40382139]
    #  [  0.70166365]
    #  [  9.79889535]]
    
    # DBSCAN with its default parameters; fit_predict returns one cluster
    # label per row of x.
    dbs = DBSCAN()
    clusters = dbs.fit_predict(x)
    
    # The data has no second coordinate, so draw every point on the line
    # y = 0 and colour it by its cluster label.
    plt.scatter(x, np.zeros(len(x)), c=clusters)
    

    enter image description here

    You can use AgglomerativeClustering for hierarchical clustering.

    Here's an example using the data from above.

    from sklearn.cluster import AgglomerativeClustering
    
    # With n_clusters=None the number of clusters is not fixed up front;
    # the distance threshold (1.0, single linkage) determines it instead.
    # Reuses x, np and plt from the DBSCAN example above.
    aggC = AgglomerativeClustering(
        linkage="single",
        distance_threshold=1.0,
        n_clusters=None,
    )
    clusters = aggC.fit_predict(x)
    
    # Same presentation as before: points on the line y = 0, coloured by label.
    plt.scatter(x, np.zeros(len(x)), c=clusters)
    

    enter image description here

    Time Series / Waveform (no other features)

    You can do it, but with no features other than time and signal amplitude, I don't know if this has any meaning.

    import datetime
    
    import matplotlib.pyplot as plt
    import numpy as np
    from scipy import signal
    from sklearn.cluster import AgglomerativeClustering
    
    # Test waveform, 1000 samples in total: two periods each of a square
    # wave, a sawtooth with width=0.5 and a sine wave. Each burst is 200
    # samples long and is surrounded by 100 samples of zeros.
    phase = 2 * np.pi * np.linspace(0, 2, 200, endpoint=False)
    silence = np.zeros(100)
    y = np.hstack(
        (
            silence,
            signal.square(phase),
            silence,
            signal.sawtooth(phase + np.pi / 2, width=0.5),
            silence,
            np.sin(phase),
            silence,
        )
    )
    
    # One POSIX timestamp per sample, spaced one microsecond apart.
    # NOTE: `start` is a naive datetime, so .timestamp() interprets it in the
    # local timezone; the printed values below depend on the machine.
    start = datetime.datetime.fromisoformat("2022-12-01T12:00:00.000000")
    times = np.array(
        [
            (start + datetime.timedelta(microseconds=offset_us)).timestamp()
            for offset_us in range(1000)
        ]
    )
    
    # Two feature columns: [timestamp, amplitude].
    # NOTE: the timestamps span only about one millisecond overall, so the
    # distances between rows are dominated by the amplitude column.
    my_sig = np.column_stack((times, y))
    
    print(my_sig[:5,:])
    # [[1.6698924e+09 0.0000000e+00]
    #  [1.6698924e+09 0.0000000e+00]
    #  [1.6698924e+09 0.0000000e+00]
    #  [1.6698924e+09 0.0000000e+00]
    #  [1.6698924e+09 0.0000000e+00]]
    
    # Number of clusters is determined by the 4.0 distance threshold;
    # linkage is left at the estimator's default.
    aggC = AgglomerativeClustering(distance_threshold=4.0, n_clusters=None)
    clusters = aggC.fit_predict(my_sig)
    
    # Signal against time, coloured by cluster label.
    time_axis, amplitude = my_sig[:, 0], my_sig[:, 1]
    plt.scatter(time_axis, amplitude, c=clusters)
    

    enter image description here