Modules

Arcos4py top level module.

This package is a Python implementation of the ARCOS algorithm for the detection and tracking of collective events in time-series data.

ARCOS(data, position_columns=['x'], frame_column='time', obj_id_column='id', measurement_column='meas', clid_column='clTrackID', n_jobs=1, **kwargs)

Detects and tracks collective events in a tracked time-series dataset.

Requires a binarized measurement column, which can be generated with the bin_measurements method. Tracking uses the DBSCAN algorithm, which is applied to every frame and subsequently connects collective events between frames located within eps distance of each other.

Attributes:

Name Type Description
data DataFrame

Data of tracked time-series in "long format". Can be used to access the modified dataframe at any point.

position_columns list

List of position column names in data, e.g. ['x', 'y']. At least one dimension is required.

frame_column str

Indicating the frame column in input_data.

obj_id_column str

Indicating the track id/object id column in input_data.

measurement_column str

Indicating the measurement column in input_data.

clid_column str

Indicating the column name containing the collective event ids.

binarized_measurement_column str | None

Name of the binary column. This is generated from the name of the measurement_column after binarization. Can optionally be set to provide an already binarized column and skip ARCOS binarization.

Parameters:

Name Type Description Default
data DataFrame

Input Data of tracked time-series in "long format" containing position columns, a measurement and an object ID column.

required
position_columns list

List of position column names in data, e.g. ['x', 'y']. At least one dimension is required.

['x']
frame_column str

Indicating the frame column in input_data.

'time'
obj_id_column str

Indicating the track id/object id column in input_data. If None, the data is assumed to not have a tracking column. Binarization can only be performed without detrending.

'id'
measurement_column str

Indicating the measurement column in input_data.

'meas'
clid_column str

Indicating the column name containing the collective event ids.

'clTrackID'
n_jobs int

Number of workers to spawn; -1 uses all available CPUs.

1
kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- posCols: Deprecated. Use position_columns instead.
- id_column: Deprecated. Use obj_id_column instead.

{}
Source code in arcos4py/_arcos4py.py, lines 57–117:
def __init__(
    self,
    data: pd.DataFrame,
    position_columns: list = ["x"],
    frame_column: str = 'time',
    obj_id_column: str | None = 'id',
    measurement_column: str = 'meas',
    clid_column: str = 'clTrackID',
    n_jobs: int = 1,
    **kwargs,
) -> None:
    """Constructs class with provided arguments.

    Arguments:
        data (DataFrame): Input Data of tracked time-series in "long format" containing position columns,
            a measurement and an object ID column.
        position_columns (list): List of position column names in data, e.g. ['x', 'y'].
            At least one dimension is required.
        frame_column (str): Indicating the frame column in input_data.
        obj_id_column (str): Indicating the track id/object id column in input_data. If None, the data is assumed to
            not have a tracking column. Binarization can only be performed without detrending.
        measurement_column (str): Indicating the measurement column in input_data.
        clid_column (str): Indicating the column name containing the collective event ids.
        n_jobs (int): Number of workers to spawn; -1 uses all available CPUs.
        kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - posCols: Deprecated. Use position_columns instead.
            - id_column: Deprecated. Use obj_id_column instead.
    """
    # allowed kwargs
    allowed_kwargs = ["posCols", "id_column"]
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"__init__() got an unexpected keyword argument '{key}'")
    # Handle deprecated parameters
    param_mapping = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
    }
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    # Assign updated kwargs to class attributes
    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)

    self.data = data
    self.position_columns = position_columns
    self.frame_column = frame_column
    self.obj_id_column = obj_id_column
    self.measurement_column = measurement_column
    self.clid_column = clid_column
    self.n_jobs = n_jobs

    # assign None so a missing binarized measurement can be detected later
    self.binarized_measurement_column: Union[str, None] = None
    if self.obj_id_column is None:
        self.data = self.data.sort_values(by=[self.frame_column])
    else:
        self.data = self.data.sort_values(by=[self.frame_column, self.obj_id_column])
    self._check_col()
    if self.measurement_column is not None:
        self.resc_col = f"{self.measurement_column}.resc"
        self.binarized_measurement_column = f"{self.measurement_column}.bin"
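
Example: a minimal construction sketch based on the signature above. The toy data and column values are illustrative only; any "long format" DataFrame with matching column names works.

import pandas as pd

from arcos4py import ARCOS

# Toy tracked time-series in "long format": one row per object per frame.
df = pd.DataFrame(
    {
        "time": [0, 0, 1, 1, 2, 2],
        "id": [1, 2, 1, 2, 1, 2],
        "x": [0.0, 5.0, 0.1, 5.1, 0.2, 5.2],
        "meas": [0.1, 0.9, 0.2, 0.8, 0.15, 0.85],
    }
)

ts = ARCOS(
    df,
    position_columns=["x"],
    frame_column="time",
    obj_id_column="id",
    measurement_column="meas",
    clid_column="clTrackID",
)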

bin_col: str | None property writable

Return the name of the binarized measurement column.

id_column: str | None property writable

Return the name of the id column.

posCols: list property writable

Return the position columns.

bin_measurements(smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', **kwargs)

Smooth, de-trend, and binarize the input data.

First, a short-term median filter of size smooth_k is applied to remove fast noise from the time series. If the de-trending method is set to "none", smoothing is applied to the globally rescaled time series. The subsequent de-trending can be performed with a long-term median filter of size bias_k (bias_method = "runmed") or by fitting a polynomial of degree polynomial_degree (bias_method = "lm").

After de-trending, the signal is rescaled to the (0, 1) range if the global difference between its min and max is greater than peak_threshold. The final signal is binarized using the binarization_threshold.

Parameters:

Name Type Description Default
smooth_k int

Size of the short-term median smoothing filter.

3
bias_k int

Size of the long-term de-trending median filter.

51
peak_threshold float

Threshold for rescaling of the de-trended signal.

0.2
binarization_threshold float

Threshold for binary classification.

0.1
polynomial_degree int

Sets the degree of the polynomial for lm fitting.

1
bias_method str

De-trending method, one of ['runmed', 'lm', 'none']. If no obj_id_column is provided, only 'none' is allowed.

'runmed'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- smoothK: Size of the short-term median smoothing filter.
- biasK: Size of the long-term de-trending median filter.
- peakThr: Threshold for rescaling of the de-trended signal.
- binThr: Threshold for binary classification.
- polyDeg: Sets the degree of the polynomial for lm fitting.
- biasMet: De-trending method, one of ['runmed', 'lm', 'none'].

{}

Returns:

Type Description
DataFrame

DataFrame with detrended/smoothed and binarized measurement column.

Source code in arcos4py/_arcos4py.py, lines 177–259:
def bin_measurements(
    self,
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    **kwargs,
) -> pd.DataFrame:
    r"""Smooth, de-trend, and binarise the input data.

    First, a short-term median filter of size smooth_k
    is applied to remove fast noise from the time series.
    If the de-trending method is set to "none",
    smoothing is applied to the globally rescaled time series.
    The subsequent de-trending can be performed with a long-term median filter
    of size bias_k (bias_method = "runmed")
    or by fitting a polynomial of degree polynomial_degree (bias_method = "lm").

    After de-trending,
    the signal is rescaled to the (0, 1) range
    if the global difference between its min and max is greater than peak_threshold.
    The final signal is binarized using the binarization_threshold.

    Arguments:
        smooth_k (int): Size of the short-term median smoothing filter.
        bias_k (int): Size of the long-term de-trending median filter.
        peak_threshold (float): Threshold for rescaling of the de-trended signal.
        binarization_threshold (float): Threshold for binary classification.
        polynomial_degree (int): Sets the degree of the polynomial for lm fitting.
        bias_method (str): De-trending method, one of ['runmed', 'lm', 'none'].
            If no obj_id_column is provided, only 'none' is allowed.
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - smoothK: Size of the short-term median smoothing filter.
            - biasK: Size of the long-term de-trending median filter.
            - peakThr: Threshold for rescaling of the de-trended signal.
            - binThr: Threshold for binary classification.
            - polyDeg: Sets the degree of the polynomial for lm fitting.
            - biasMet: De-trending method, one of ['runmed', 'lm', 'none'].

    Returns:
        DataFrame with detrended/smoothed and binarized measurement column.
    """
    # allowed kwargs
    param_mapping = {
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
    }
    # allowed kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"bin_measurements() got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)

    self.data = binData(
        smooth_k,
        bias_k,
        peak_threshold,
        binarization_threshold,
        polynomial_degree,
        bias_method,
        n_jobs=self.n_jobs,
    ).run(
        self.data,
        measurement_column=self.measurement_column,
        group_column=self.obj_id_column,
        frame_column=self.frame_column,
    )
    return self.data
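
Example: a sketch of the typical pre-processing chain (interpolate, clip, binarize), continuing the ts object from the constructor example above; the parameter values shown are simply the documented defaults.

ts.interpolate_measurements()                          # fill NaNs in the measurement column
ts.clip_measurements(clip_low=0.001, clip_high=0.999)  # clip outliers to quantiles
binarized = ts.bin_measurements(
    smooth_k=3,
    bias_k=51,
    peak_threshold=0.2,
    binarization_threshold=0.1,
    bias_method="runmed",
)
# The binarized column is named "<measurement_column>.bin", here "meas.bin".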

clip_meas(clip_low=0.001, clip_high=0.999)

Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

Name Type Description Default
clip_low float

Lower clipping boundary (quantile).

0.001
clip_high float

Upper clipping boundary (quantile).

0.999

Returns:

Type Description
DataFrame

DataFrame with the measurement column clipped in place.

Source code in arcos4py/_arcos4py.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def clip_meas(self, clip_low: float = 0.001, clip_high: float = 0.999) -> pd.DataFrame:
    """Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).

        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        DataFrame with the measurement column clipped in place.
    """
    # Issue a deprecation warning
    warnings.warn(
        "The 'clip_meas' method is deprecated and will be removed in a future version.\
        Please use 'clip_measurements' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.clip_measurements(clip_low, clip_high)

clip_measurements(clip_low=0.001, clip_high=0.999)

Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

Name Type Description Default
clip_low float

Lower clipping boundary (quantile).

0.001
clip_high float

Upper clipping boundary (quantile).

0.999

Returns:

Type Description
DataFrame

DataFrame with the measurement column clipped in place.

Source code in arcos4py/_arcos4py.py, lines 141–155:
def clip_measurements(self, clip_low: float = 0.001, clip_high: float = 0.999) -> pd.DataFrame:
    """Clip measurement column to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).

        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        DataFrame with the measurement column clipped in place.
    """
    meas_column = self.data[self.measurement_column].to_numpy()
    meas_clipped = clipMeas(meas_column).clip(clip_low, clip_high)
    self.data[self.measurement_column] = meas_clipped
    return self.data

interpolate_measurements()

Interpolates NaNs in the measurement column in place.

Returns:

Type Description
DataFrame

Dataframe with interpolated measurement column.

Source code in arcos4py/_arcos4py.py, lines 131–139:
def interpolate_measurements(self) -> pd.DataFrame:
    """Interpolates NaN's in place in measurement column.

    Returns:
        Dataframe with interpolated measurement column.
    """
    meas_interp = interpolation(self.data).interpolate()
    self.data = meas_interp
    return self.data

trackCollev(eps=1, eps_prev=None, min_clustersize=1, n_prev=1, clustering_method='dbscan', linking_method='nearest', min_samples=None, **kwargs)

Detects and tracks collective events in a tracked time-series dataset.

Applies the DBSCAN algorithm to every frame and subsequently connects collective events between frames located within eps distance of each other.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster.

1
eps_prev float | None

Frame to frame distance, value is used to connect collective events across multiple frames. If None, the same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
clustering_method str

Clustering method, one of ['dbscan', 'hdbscan'].

'dbscan'
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
linking_method str

Linking method, one of ['nearest', 'transportation'].

'nearest'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- epsPrev: Frame to frame distance, value is used to connect collective events across multiple frames.
- minClsz: The minimum size for a cluster to be identified as a collective event.
- nPrev: Number of previous frames the tracking algorithm looks back to connect collective events.
- clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
- minSamples: The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
- linkingMethod: Linking method, one of ['nearest', 'transportation'].

{}

Returns:

Type Description
DataFrame

DataFrame with detected collective events across time.

Source code in arcos4py/_arcos4py.py, lines 261–315:
def trackCollev(
    self,
    eps: float = 1,
    eps_prev: Union[float, None] = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    clustering_method: str = "dbscan",
    linking_method: str = "nearest",
    min_samples: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """Detects and tracks collective events in a tracked time-series dataset.

    Applies the DBSCAN algorithm to every frame and subsequently connects
    collective events between frames located within eps distance of each other.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        eps_prev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames. If None, the same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event.
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        clustering_method (str): Clustering method, one of ['dbscan', 'hdbscan'].
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        linking_method (str): Linking method, one of ['nearest', 'transportation'].
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - epsPrev: Frame to frame distance, value is used to connect
                collective events across multiple frames.
            - minClsz: The minimum size for a cluster to be identified as a collective event
            - nPrev: Number of previous frames the tracking
                algorithm looks back to connect collective events
            - clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
            - minSamples: The number of samples (or total weight) in a neighbourhood for a
                point to be considered as a core point. This includes the point itself.
                Only used if clustering_method is 'hdbscan'. If None, min_samples =  min_clustersize.
            - linkingMethod: Linking method, one of ['nearest', 'transportation'].

    Returns:
        DataFrame with detected collective events across time.
    """
    warnings.warn(
        "The 'trackCollev' method is deprecated and will be removed in a future version.\
            Please use 'track_collective_events' instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return self.track_collective_events(
        eps, eps_prev, min_clustersize, n_prev, clustering_method, linking_method, min_samples, **kwargs
    )

track_collective_events(eps=1, eps_prev=None, min_clustersize=1, n_prev=1, clustering_method='dbscan', linking_method='nearest', min_samples=None, **kwargs)

Detects and tracks collective events in a tracked time-series dataset.

Applies the DBSCAN algorithm to every frame and subsequently connects collective events between frames located within eps distance of each other.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster.

1
eps_prev float | None

Frame to frame distance, value is used to connect collective events across multiple frames. If None, the same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
clustering_method str

Clustering method, one of ['dbscan', 'hdbscan'].

'dbscan'
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
linking_method str

Linking method, one of ['nearest', 'transportation'].

'nearest'
**kwargs Any

Additional keyword arguments. Includes old parameter names for backwards compatibility.
- epsPrev: Frame to frame distance, value is used to connect collective events across multiple frames.
- minClsz: The minimum size for a cluster to be identified as a collective event.
- nPrev: Number of previous frames the tracking algorithm looks back to connect collective events.
- clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
- minSamples: The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
- linkingMethod: Linking method, one of ['nearest', 'transportation'].

{}

Returns:

Type Description
DataFrame

DataFrame with detected collective events across time.

Source code in arcos4py/_arcos4py.py, lines 317–402:
def track_collective_events(
    self,
    eps: float = 1,
    eps_prev: Union[float, None] = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    clustering_method: str = "dbscan",
    linking_method: str = "nearest",
    min_samples: int | None = None,
    **kwargs,
) -> pd.DataFrame:
    """Detects and tracks collective events in a tracked time-series dataset.

    Applies the DBSCAN algorithm to every frame and subsequently connects
    collective events between frames located within eps distance of each other.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        eps_prev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames. If None, the same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event.
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        clustering_method (str): Clustering method, one of ['dbscan', 'hdbscan'].
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        linking_method (str): Linking method, one of ['nearest', 'transportation'].
        **kwargs (Any): Additional keyword arguments. Includes old parameter names for backwards compatibility.
            - epsPrev: Frame to frame distance, value is used to connect
                collective events across multiple frames.
            - minClsz: The minimum size for a cluster to be identified as a collective event
            - nPrev: Number of previous frames the tracking
                algorithm looks back to connect collective events
            - clusteringMethod: Clustering method, one of ['dbscan', 'hdbscan'].
            - minSamples: The number of samples (or total weight) in a neighbourhood for a
                point to be considered as a core point. This includes the point itself.
                Only used if clustering_method is 'hdbscan'. If None, min_samples =  min_clustersize.
            - linkingMethod: Linking method, one of ['nearest', 'transportation'].

    Returns:
        DataFrame with detected collective events across time.
    """
    param_mapping = {
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "nPrev": "n_prev",
        "clusteringMethod": "clustering_method",
        "minSamples": "min_samples",
        "linkingMethod": "linking_method",
    }
    # allowed kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"track_collective_events() got an unexpected keyword argument '{key}'")
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    n_prev = updated_kwargs.get("n_prev", n_prev)
    clustering_method = updated_kwargs.get("clustering_method", clustering_method)
    min_samples = updated_kwargs.get("min_samples", min_samples)
    linking_method = updated_kwargs.get("linking_method", linking_method)

    data_events = track_events_dataframe(
        X=self.data,
        position_columns=self.position_columns,
        frame_column=self.frame_column,
        id_column=self.obj_id_column,
        binarized_measurement_column=self.binarized_measurement_column,
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        n_prev=n_prev,
        clid_column=self.clid_column,
        linking_method=linking_method,
        clustering_method=clustering_method,
        min_samples=min_samples,
        n_jobs=self.n_jobs,
    )

    return data_events
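
Example: a sketch of the tracking step, continuing the pipeline above; eps and the other values are illustrative toy choices.

events = ts.track_collective_events(
    eps=2.0,                      # per-frame DBSCAN neighbourhood radius
    min_clustersize=1,
    n_prev=1,
    clustering_method="dbscan",
    linking_method="nearest",
)
# events holds the collective event ids in the "clTrackID" column.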

plotting

Tools for plotting collective events.

NoodlePlot(df, clid_column='collid', obj_id_column='obj_id', frame_column='frame', posx='x', posy='y', posz=None, **kwargs)

Create Noodle Plot of cell tracks, colored by collective event id.

Attributes:

Name Type Description
df DataFrame

DataFrame containing collective events from arcos.

clid_column str

Name of the collective event column in df.

obj_id_column str

Name of the track column in df.

frame_column str

Name of the frame column in df.

posx str

Name of the X coordinate column in df.

posy str

Name of the Y coordinate column in df.

posz str

Name of the Z coordinate column in df, or None if no z column.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing collective events from arcos.

required
clid_column str

Name of the collective event column in df.

'collid'
obj_id_column str

Name of the track column in df.

'obj_id'
frame_column str

Name of the frame column in df.

'frame'
posx str

Name of the X coordinate column in df.

'x'
posy str

Name of the Y coordinate column in df.

'y'
posz str | None

Name of the Z coordinate column in df, or None if no z column.

None
**kwargs Any

Additional keyword arguments for plot. Includes deprecated parameters.
- colev (str): Deprecated. Use clid_column instead.
- trackid (str): Deprecated. Use obj_id_column instead.
- frame (str): Deprecated. Use frame_column instead.

{}
Source code in arcos4py/plotting/_plotting.py, lines 437–514:
def __init__(
    self,
    df: pd.DataFrame,
    clid_column: str = "collid",
    obj_id_column: str = "obj_id",
    frame_column: str = "frame",
    posx: str = "x",
    posy: str = "y",
    posz: Union[str, None] = None,
    **kwargs,
):
    """Constructs class with given parameters.

    Arguments:
        df (pd.DataFrame): DataFrame containing collective events from arcos.
        clid_column (str): Name of the collective event column in df.
        obj_id_column (str): Name of the track column in df.
        frame_column (str): Name of the frame column in df.
        posx (str): Name of the X coordinate column in df.
        posy (str): Name of the Y coordinate column in df.
        posz (str | None): Name of the Z coordinate column in df,
            or None if no z column.
        **kwargs (Any): Additional keyword arguments for plot. Includes deprecated parameters.
            - colev (str): Deprecated. Use clid_column instead.
            - trackid (str): Deprecated. Use obj_id_column instead.
            - frame (str): Deprecated. Use frame_column instead.
    """
    map_deprecated_params = {
        "colev": "clid_column",
        "trackid": "obj_id_column",
        "frame": "frame_column",
    }

    # allowed matplotlib kwargs
    allowed_kwargs = [
        "alpha",
        "animated",
        "c",
        "label",
        "linewidth",
        "linestyle",
        "marker",
        "markersize",
        "markeredgecolor",
        "markerfacecolor",
        "markerfacecoloralt",
        "markeredgewidth",
        "path_effects",
        "picker",
        "pickradius",
        "solid_capstyle",
        "solid_joinstyle",
        "transform",
        "visible",
        "zorder",
    ]

    # check allowed kwargs
    allowed_kwargs_2 = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs and key not in allowed_kwargs_2:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.pop("clid_column", clid_column)
    obj_id_column = updated_kwargs.pop("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.pop("frame_column", frame_column)

    self.df = df
    self.clid_column = clid_column
    self.obj_id_column = obj_id_column
    self.frame_column = frame_column
    self.posx = posx
    self.posy = posy
    self.posz = posz
    self.plot_kwargs = updated_kwargs

plot(projection_axis, color_cylce=TAB20)

Create Noodle Plot of cell tracks, colored by collective event id.

Parameters:

Name Type Description Default
projection_axis str

Specify which coordinate the noodle plot should be drawn with. Has to be one of the posx, posy or posz arguments passed in during class instantiation.

required
color_cylce list[str]

List of hex color values or string names (e.g. ['red', 'yellow']) used to color collective events. Cycles through the list.

TAB20

Returns:

Name Type Description
fig Figure

Matplotlib figure object for the noodle plot.

axes Axes

Matplotlib axes for the noodle plot.

Source code in arcos4py/plotting/_plotting.py, lines 588–623:
def plot(self, projection_axis: str, color_cylce: list[str] = TAB20):
    """Create Noodle Plot of cell tracks, colored by collective event id.

    Arguments:
        projection_axis (str): Specify which coordinate the noodle
            plot should be drawn with. Has to be one of the posx, posy or posz arguments
            passed in during class instantiation.
        color_cylce (list[str]): List of hex color values or string names
            (e.g. ['red', 'yellow']) used to color collective events. Cycles through the list.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object for the noodle plot.
        axes (matplotlib.axes.Axes): Matplotlib axes for the noodle plot.
    """
    if self.df.empty:
        raise ValueError("Dataframe is empty")
    if projection_axis not in [self.posx, self.posy, self.posz]:
        raise ValueError(f"projection_axis has to be one of {[self.posx, self.posy, self.posz]}")
    if projection_axis == self.posx:
        self.projection_index = 3
    elif projection_axis == self.posy:
        self.projection_index = 4
    elif projection_axis == self.posz:
        self.projection_index = 5
    grpd_data, colors = self._prepare_data_noodleplot(
        self.df,
        color_cylce,
        self.clid_column,
        self.obj_id_column,
        self.frame_column,
        self.posx,
        self.posy,
        self.posz,
    )
    fig, axes = self._create_noodle_plot(grpd_data, colors)
    return fig, axes
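
Example: a usage sketch, assuming NoodlePlot is importable from arcos4py.plotting (as the source path above suggests) and that events is a tracked DataFrame with the given column names, including a y coordinate.

import matplotlib.pyplot as plt

from arcos4py.plotting import NoodlePlot

ndl = NoodlePlot(
    events,
    clid_column="clTrackID",
    obj_id_column="id",
    frame_column="time",
    posx="x",
    posy="y",
)
fig, axes = ndl.plot(projection_axis="x")  # project tracks onto x over time
plt.show()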

dataPlots(data, frame_column='frame', measurement_column='m', obj_id_column='obj_id', **kwargs)

Plot different metrics of input data.

Attributes:

Name Type Description
data Dataframe

containing ARCOS data.

frame_column str

name of frame column in data.

measurement_column str

name of measurement column in data.

obj_id_column str

name of track id column.

Parameters:

Name Type Description Default
data Dataframe

containing ARCOS data.

required
frame_column str

name of frame column in data.

'frame'
measurement_column str

name of measurement column in data.

'm'
obj_id_column str

name of track id column.

'obj_id'
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- id (str): Deprecated. Use obj_id_column instead.
- frame (str): Deprecated. Use frame_column instead.
- measurement (str): Deprecated. Use measurement_column instead.

{}
Source code in arcos4py/plotting/_plotting.py, lines 72–114:
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = 'frame',
    measurement_column: str = 'm',
    obj_id_column: str = 'obj_id',
    **kwargs,
):
    """Plot different metrics such as histogram, position-t and density.

    Arguments:
        data (Dataframe): containing ARCOS data.
        frame_column (str): name of frame column in data.
        measurement_column (str): name of measurement column in data.
        obj_id_column (str): name of track id column.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - id (str): Deprecated. Use obj_id_column instead.
            - frame (str): Deprecated. Use frame_column instead.
            - measurement (str): Deprecated. Use measurement_column instead.
    """
    map_deprecated_params = {
        "id": "obj_id_column",
        "frame": "frame_column",
        "measurement": "measurement_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)

    self.data = data
    self.obj_id = obj_id_column
    self.frame_column = frame_column
    self.measurement_column = measurement_column
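
Example: a hedged sketch of the three plot types this class offers, assuming dataPlots is importable from arcos4py.plotting and df uses the column names from the examples above.

from arcos4py.plotting import dataPlots

dp = dataPlots(df, frame_column="time", measurement_column="meas", obj_id_column="id")
dp.position_t_plot(position_columns={"x"}, n=2)  # x over t for a sample of 2 tracks
dp.histogram(bins="auto")                        # track-length histogram
dp.density_plot()                                # kernel density of the measurement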

density_plot(*args, **kwargs)

Density plot of measurement.

Uses the seaborn displot function (kind="kde") to plot measurement density.

Parameters:

Name Type Description Default
*args Any

arguments passed on to seaborn histplot function.

()
**kwargs Any

keyword arguments passed on to seaborn histplot function.

{}

Returns:

Name Type Description
FacetGrid FacetGrid

Seaborn FacetGrid of the density plot.

Source code in arcos4py/plotting/_plotting.py, lines 160–185:
def density_plot(self, *args, **kwargs):
    """Density plot of measurement.

    Uses the seaborn displot function (kind="kde") to plot measurement density.

    Arguments:
        *args (Any): arguments passed on to seaborn histplot function.
        **kwargs (Any): keyword arguments passed on to seaborn histplot function.

    Returns:
        FacetGrid (seaborn.FacetGrid): Seaborn FacetGrid of the density plot.
    """
    plot = sns.displot(
        self.data[self.measurement_column],
        kind="kde",
        palette="pastel",
        label=self.measurement_column,
        *args,
        **kwargs,
    )
    # Plot formatting
    plt.legend(prop={'size': 10})
    plt.title('Density Plot of Measurement')
    plt.xlabel('Measurement')
    plt.ylabel('Density')
    return plot

histogram(bins='auto', *args, **kwargs)

Histogram of track length.

Uses the seaborn histplot function to plot a track length histogram.

Parameters:

Name Type Description Default
bins str

Number or width of bins in the histogram.

'auto'
*args Any

arguments passed on to seaborn histplot function.

()
**kwargs Any

keyword arguments passed on to seaborn histplot function.

{}

Returns:

Name Type Description
AxesSubplot Axes

Matplotlib AxesSubplot of histogram.

Source code in arcos4py/plotting/_plotting.py, lines 187–207:
def histogram(self, bins: str = 'auto', *args, **kwargs) -> plt.Axes:
    """Histogram of tracklenght.

    Uses seaborn histplot function to plot tracklenght histogram.

    Arguments:
        bins (str): Number or width of bins in the histogram.
        *args (Any): arguments passed on to seaborn histplot function.
        **kwargs (Any): keyword arguments passed on to seaborn histplot function.

    Returns:
        AxesSubplot: Matplotlib AxesSubplot of histogram.
    """
    # Draw histogram
    track_length = self.data.groupby(self.obj_id).size()
    axes = sns.histplot(track_length, label="Track Length", bins=bins, *args, **kwargs)
    # Plot formatting
    plt.title('Track length Histogram')
    axes.set_xlabel('Track Length')
    axes.set_ylabel('Count')
    return axes

position_t_plot(position_columns={'x'}, n=20, **kwargs)

Plots X and Y over T to visualize track length.

Parameters:

Name Type Description Default
position_columns set

containing names of position columns in data.

{'x'}
n int

number of samples to plot.

20
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- posCol (set): Deprecated. Use position_columns instead.

{}

Returns:

Name Type Description
fig Figure

Matplotlib figure object of the plot.

axes Axes

Matplotlib axes of the plot.

Source code in arcos4py/plotting/_plotting.py, lines 116–158:
def position_t_plot(self, position_columns: set[str] = {'x'}, n: int = 20, **kwargs) -> Union[plt.Figure, Any]:
    """Plots X and Y over T to visualize tracklength.

    Arguments:
        position_columns (set): containing names of position columns in data.
        n (int): number of samples to plot.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCol (set): Deprecated. Use position_columns instead.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of the plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of the plot.
    """
    map_deprecated_params = {
        "posCol": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    position_columns = updated_kwargs.get("position_columns", position_columns)

    sample = pd.Series(self.data[self.obj_id].unique()).sample(n)
    pd_from_r_df = self.data.loc[self.data[self.obj_id].isin(sample)]
    fig, axes = plt.subplots(1, len(position_columns), figsize=(6, 3))
    for _, df in pd_from_r_df.groupby(self.obj_id):
        for index, value in enumerate(position_columns):
            if len(position_columns) > 1:
                df.plot(x=self.frame_column, y=value, ax=axes[index], legend=None)
            else:
                df.plot(x=self.frame_column, y=value, ax=axes, legend=None)
    if len(position_columns) > 1:
        for index, value in enumerate(position_columns):
            axes[index].set_title(value)
    else:
        axes.set_title(value)
    return fig, axes

plotOriginalDetrended(data, frame_column='frame', measurement_column='m', detrended_column='m_detrended', obj_id_column='obj_id', seed=42, **kwargs)

Plot original and detrended data.

Attributes:

Name Type Description
data DataFrame

containing ARCOS data.

frame_column str

name of frame column in data.

measurement_column str

name of measurement column in data.

detrended_column str

name of detrended column in data.

obj_id_column str

name of track id column.

seed int

seed for random number generator.

Methods:

Name Description
plot_detrended

plot detrended data.

plot_original

plot original data.

plot_original_and_detrended

plot original and detrended data.

Source code in arcos4py/plotting/_plotting.py, lines 227–263:
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = "frame",
    measurement_column: str = "m",
    detrended_column: str = "m_detrended",
    obj_id_column: str = "obj_id",
    seed: int = 42,
    **kwargs,
):
    """Constructs class with given parameters."""
    map_deprecated_params = {
        "id": "obj_id_column",
        "frame": "frame_column",
        "detrended": "detrended_column",
        "measurement": "measurement_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    detrended_column = updated_kwargs.get("detrended_column", detrended_column)

    self.data = data
    self.frame_column = frame_column
    self.measurement_column = measurement_column
    self.detrended_column = detrended_column
    self.obj_id_column = obj_id_column
    self.seed = seed
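
Example: a usage sketch, assuming the binarized DataFrame from bin_measurements above; the detrended column name "meas.resc" follows the "<measurement_column>.resc" pattern set in the ARCOS constructor.

from arcos4py.plotting import plotOriginalDetrended

pod = plotOriginalDetrended(
    binarized,
    frame_column="time",
    measurement_column="meas",
    detrended_column="meas.resc",  # rescaled/de-trended column from binarization
    obj_id_column="id",
)
fig, axes = pod.plot_original_and_detrended(n_samples=2, subplots=(1, 2))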

plot_detrended(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots detrended data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py, lines 304–326:
def plot_detrended(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots detrended data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped, subplots[0], subplots[1], plotsize, [self.detrended_column], ["detrended"], add_binary_segments
    )

plot_original(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots original data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py, lines 328–356:
def plot_original(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots original data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped,
        subplots[0],
        subplots[1],
        plotsize,
        [self.measurement_column],
        ["original"],
        add_binary_segments,
    )

plot_original_and_detrended(n_samples=25, subplots=(5, 5), plotsize=(20, 10), add_binary_segments=False)

Plots original and detrended data.

Parameters:

Name Type Description Default
n_samples int

number of samples to plot.

25
subplots tuple

number of subplots in x and y direction.

(5, 5)
plotsize tuple

size of the plot.

(20, 10)
add_binary_segments bool

if True, binary segments are added to the plot.

False

Returns:

Name Type Description
fig Figure

Matplotlib figure object of plot.

axes Axes

Matplotlib axes of plot.

Source code in arcos4py/plotting/_plotting.py, lines 358–386:
def plot_original_and_detrended(
    self,
    n_samples: int = 25,
    subplots: tuple = (5, 5),
    plotsize: tuple = (20, 10),
    add_binary_segments: bool = False,
) -> tuple[plt.Figure, Any]:
    """Plots original and detrended data.

    Arguments:
        n_samples (int): number of samples to plot.
        subplots (tuple): number of subplots in x and y direction.
        plotsize (tuple): size of the plot.
        add_binary_segments (bool): if True, binary segments are added to the plot.

    Returns:
        fig (matplotlib.figure.Figure): Matplotlib figure object of plot.
        axes (matplotlib.axes.Axes): Matplotlib axes of plot.
    """
    grouped = self._prepare_data(n_samples)
    return self._plot_data(
        grouped,
        subplots[0],
        subplots[1],
        plotsize,
        [self.measurement_column, self.detrended_column],
        ["original", "detrended"],
        add_binary_segments,
    )

statsPlots(data)

Plot data generated by the stats module.

Attributes:

Name Type Description
data DataFrame

containing ARCOS stats data.

Parameters:

Name Type Description Default
data DataFrame

containing ARCOS stats data.

required
Source code in arcos4py/plotting/_plotting.py, lines 396–402:
def __init__(self, data: pd.DataFrame):
    """Plot detrended vs original data.

    Arguments:
        data (DataFrame): containing ARCOS stats data.
    """
    self.data = data

plot_events_duration(total_size, duration, point_size=40, *args, **kwargs)

Scatterplot of collective event duration.

Parameters:

Name Type Description Default
total_size str

name of total size column.

required
duration str

Name of the column with collective event duration.

required
point_size int

scatterplot point size.

40
*args Any

Arguments passed on to seaborn scatterplot function.

()
**kwargs Any

Keyword arguments passed on to seaborn scatterplot function.

{}

Returns:

Name Type Description
Axes Axes

Matplotlib Axes object of the scatterplot.

Source code in arcos4py/plotting/_plotting.py, lines 404–420:
def plot_events_duration(self, total_size: str, duration: str, point_size: int = 40, *args, **kwargs) -> plt.Axes:
    """Scatterplot of collective event duration.

    Arguments:
        total_size (str): name of total size column.
        duration (str): Name of the column with collective event duration.
        point_size (int): scatterplot point size.
        *args (Any): Arguments passed on to seaborn scatterplot function.
        **kwargs (Any): Keyword arguments passed on to seaborn scatterplot function.

    Returns:
        Axes (matplotlib.axes.Axes): Matplotlib Axes object of the scatterplot.
    """
    if self.data.empty:
        raise ValueError("Dataframe is empty")
    plot = sns.scatterplot(x=self.data[total_size], y=self.data[duration], s=point_size, *args, **kwargs)
    return plot
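
Example: a hedged sketch; stats_df is assumed to be a per-event summary table (as produced by the stats module), and the "total_size" and "duration" column names are illustrative.

from arcos4py.plotting import statsPlots

sp = statsPlots(stats_df)
ax = sp.plot_events_duration(total_size="total_size", duration="duration", point_size=40)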

tools

Tools for detecting collective events.

DataFrameTracker(linker, position_columns=['x'], frame_column='frame', obj_id_column=None, binarized_measurement_column=None, clid_column='clTrackID', **kwargs)

Bases: BaseTracker

Tracker class for data frames that works in conjunction with the Linker class.

Methods:

Name Description
track_iteration(x: pd.DataFrame)

Tracks events in a single frame.

track(x: pd.DataFrame) -> Generator

Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

Parameters:

Name Type Description Default
linker Linker

The Linker object used for linking events.

required
position_columns list[str]

List of strings representing the coordinate columns.

['x']
frame_column str

String representing the frame/timepoint column in the dataframe.

'frame'
obj_id_column str | None

String representing the ID column, or None if not present. Defaults to None.

None
binarized_measurement_column str | None

String representing the binary measurement column, or None if not present. Defaults to None.

None
clid_column str

String representing the collective event track ID column. Defaults to 'clTrackID'.

'clTrackID'
kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
- coordinates_column: Deprecated. Use position_columns instead.
- collid_column: Deprecated. Use clid_column instead.
- id_column: Deprecated. Use obj_id_column instead.
- bin_meas_column: Deprecated. Use binarized_measurement_column instead.

{}
Source code in arcos4py/tools/_detect_events.py, lines 788–842:
def __init__(
    self,
    linker: Linker,
    position_columns: list[str] = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str | None = None,
    binarized_measurement_column: str | None = None,
    clid_column: str = 'clTrackID',
    **kwargs,
):
    """Initializes the DataFrameTracker object.

    Arguments:
        linker (Linker): The Linker object used for linking events.
        position_columns (list[str]): List of strings representing the coordinate columns.
        frame_column (str): String representing the frame/timepoint column in the dataframe.
        obj_id_column (str | None): String representing the ID column, or None if not present. Defaults to None.
        binarized_measurement_column (str | None): String representing the binary measurement column, or None if not present.
            Defaults to None.
        clid_column (str): String representing the collective event track ID column. Defaults to 'clTrackID'.
        kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - coordinates_column: Deprecated parameter for position_columns. Use position_columns instead.
            - collid_column: Deprecated parameter, use clid_column instead.
            - id_column: Deprecated parameter, use obj_id_column instead.
            - bin_meas_column: Deprecated parameter, use binarized_measurement_column instead.
    """
    map_deprecated_params = {
        'coordinates_column': 'position_columns',
        'collid_column': 'clid_column',
        'id_column': 'obj_id_column',
        'bin_meas_column': 'binarized_measurement_column',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_deprecated_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    corrected_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assign parameters
    position_columns = corrected_kwargs.get('position_columns', position_columns)
    obj_id_column = corrected_kwargs.get('obj_id_column', obj_id_column)
    binarized_measurement_column = corrected_kwargs.get(
        'binarized_measurement_column', binarized_measurement_column
    )
    clid_column = corrected_kwargs.get('clid_column', clid_column)

    super().__init__(linker)
    self._coordinates_column = position_columns
    self._frame_column = frame_column
    self._id_column = obj_id_column
    self._binarized_measurement_column = binarized_measurement_column
    self._collid_column = clid_column
    self._validate_input(position_columns, frame_column, obj_id_column, binarized_measurement_column, clid_column)

track(x)

Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

Parameters:

Name Type Description Default
x DataFrame

Dataframe to track.

required

Yields:

Name Type Description
Generator Generator

Tracked dataframe.

Source code in arcos4py/tools/_detect_events.py, lines 936–952:
def track(self, x: pd.DataFrame) -> Generator:
    """Main method for tracking events through the dataframe. Yields the tracked dataframe for each iteration.

    Arguments:
        x (pd.DataFrame): Dataframe to track.

    Yields:
        Generator: Tracked dataframe.
    """
    if x.empty:
        raise ValueError('Input is empty')
    x_sorted = self._sort_input(x, frame_column=self._frame_column, object_id_column=self._id_column)

    for t in range(x_sorted[self._frame_column].max() + 1):
        x_frame = x_sorted.query(f'{self._frame_column} == {t}')
        x_tracked = self.track_iteration(x_frame)
        yield x_tracked
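
Example: a streaming sketch. The Linker constructor is not documented in this section, so the eps/min_clustersize/n_prev keywords below are assumptions based on the tracking parameters shown elsewhere on this page; check the Linker documentation for the exact signature.

import pandas as pd

from arcos4py.tools import DataFrameTracker, Linker

linker = Linker(eps=2.0, min_clustersize=1, n_prev=1)  # hypothetical parameters
tracker = DataFrameTracker(
    linker,
    position_columns=["x"],
    frame_column="time",
    obj_id_column="id",
    binarized_measurement_column="meas.bin",
    clid_column="clTrackID",
)
# track() yields one tracked DataFrame per frame; concatenate for the full result.
tracked = pd.concat(tracker.track(binarized), ignore_index=True)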

track_iteration(x)

Tracks events in a single frame. Returns dataframe with event ids.

Parameters:

Name Type Description Default
x DataFrame

Dataframe to track.

required

Returns:

Type Description
DataFrame

pd.DataFrame: Dataframe with event ids.

Source code in arcos4py/tools/_detect_events.py, lines 906–934:
def track_iteration(self, x: pd.DataFrame) -> pd.DataFrame:
    """Tracks events in a single frame. Returns dataframe with event ids.

    Arguments:
        x (pd.DataFrame): Dataframe to track.

    Returns:
        pd.DataFrame: Dataframe with event ids.
    """
    x_filtered = self._filter_active(x, self._binarized_measurement_column)

    coordinates_data = self._select_necessary_columns(
        x_filtered,
        self._coordinates_column,
    )
    self.linker.link(coordinates_data)

    if self._collid_column in x.columns:
        df_out = x_filtered.drop(columns=[self._collid_column]).copy()
    else:
        df_out = x_filtered.copy()
    event_ids = self.linker.event_ids

    if not event_ids.size:
        df_out[self._collid_column] = 0
        return df_out

    df_out[self._collid_column] = self.linker.event_ids
    return df_out

ImageTracker(linker, downsample=1)

Bases: BaseTracker

Tracker class for image data that works in conjunction with the Linker class.

Methods:

Name Description
track_iteration

Tracks events in a single frame. Returns the tracked labels.

track

Main method for tracking events through the image series. Yields the tracked image for each iteration.

Parameters:

Name Type Description Default
linker Linker

The Linker object used for linking events.

required
downsample int

Downsampling factor for the images. Defaults to 1, meaning no downsampling.

1
Source code in arcos4py/tools/_detect_events.py
def __init__(self, linker: Linker, downsample: int = 1):
    """Initializes the ImageTracker object.

    Arguments:
        linker (Linker): The Linker object used for linking events.
        downsample (int): Downsampling factor for the images. Defaults to 1, meaning no downsampling.
    """
    super().__init__(linker)
    self._downsample = downsample

track(x, dims='TXY')

Method for tracking events through the image series. Yields the tracked image for each iteration.

Parameters:

Name Type Description Default
x ndarray

Image to track.

required
dims str

String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", and "Z".

'TXY'

Returns:

Name Type Description
Generator Generator

Generator that yields the tracked image for each iteration.

Source code in arcos4py/tools/_detect_events.py
def track(self, x: np.ndarray, dims: str = "TXY") -> Generator:
    """Method for tracking events through the image series. Yields the tracked image for each iteration.

    Arguments:
        x (np.ndarray): Image to track.
        dims (str): String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", and "Z".

    Returns:
        Generator: Generator that yields the tracked image for each iteration.
    """
    available_dims = ["T", "X", "Y", "Z"]
    dims_list = list(dims.upper())

    # check input
    for i in dims_list:
        if i not in available_dims:
            raise ValueError(f"Invalid dimension {i}. Must be 'T', 'X', 'Y', or 'Z'.")

    if len(dims_list) > len(set(dims_list)):
        raise ValueError("Duplicate dimensions in dims.")

    if len(dims_list) != x.ndim:
        raise ValueError(
            f"Length of dims must be equal to number of dimensions in image. Image has {x.ndim} dimensions."
        )

    dims_dict = {i: dims_list.index(i) for i in available_dims if i in dims_list}

    # reorder image so T is first dimension
    image_reshaped = np.moveaxis(x, dims_dict["T"], 0)

    for x_frame in image_reshaped:
        x_tracked = self.track_iteration(x_frame)
        yield x_tracked
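
A minimal usage sketch for image input, assuming ImageTracker and Linker are importable from arcos4py/tools/_detect_events.py (the import path is an assumption):

# Hedged sketch: import path is an assumption; the API follows the docs above.
import numpy as np
from arcos4py.tools._detect_events import ImageTracker, Linker

# Tiny binary time series: 3 frames of 16x16 with one active blob drifting right.
stack = np.zeros((3, 16, 16), dtype=np.int64)
for t in range(3):
    stack[t, 4:8, 4 + t:8 + t] = 1

tracker = ImageTracker(Linker(eps=1.5, min_clustersize=2), downsample=1)

# track() yields one labeled frame per timepoint; stack them back into a movie.
tracked = np.stack(list(tracker.track(stack, dims="TXY")))
print(tracked.shape, np.unique(tracked))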

track_iteration(x)

Tracks events in a single frame. Returns the tracked labels.

Parameters:

Name Type Description Default
x ndarray

Image to track.

required

Returns:

Type Description
ndarray

np.ndarray: Tracked labels.

Source code in arcos4py/tools/_detect_events.py
def track_iteration(self, x: np.ndarray) -> np.ndarray:
    """Tracks events in a single frame. Returns the tracked labels.

    Arguments:
        x (np.ndarray): Image to track.

    Returns:
        np.ndarray: Tracked labels.
    """
    x = downscale_image(x, self._downsample)
    coordinates_data, meas_data = self._image_to_coordinates(x)
    coordinates_data_filtered = self._filter_active(coordinates_data, meas_data)

    self.linker.link(coordinates_data_filtered)

    tracked_events = self.linker.event_ids
    out_img = self._coordinates_to_image(x, coordinates_data_filtered, tracked_events)

    if self._downsample > 1:
        out_img = upscale_image(out_img, self._downsample)

    return out_img

Linker(eps=1, eps_prev=None, min_clustersize=1, min_samples=None, clustering_method='dbscan', linking_method='nearest', predictor=True, n_prev=1, cost_threshold=0, reg=1, reg_m=10, n_jobs=1, **kwargs)

Linker class for linking collective events across multiple frames.

Attributes:

Name Type Description
event_ids ndarray

Array to store event IDs, for each coordinate in the current frame.

Methods:

Name Description
link

Links clusters from the previous frame to the current frame.

Parameters:

Name Type Description Default
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other.

1
eps_prev float | None

Frame to frame distance, value is used to connect collective events across multiple frames. If "None", same value as eps is used.

None
min_clustersize int

The minimum size for a cluster to be identified as a collective event.

1
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clusteringMethod is 'hdbscan'. If None, minSamples = minClsz.

None
clustering_method str | Callable

The clustering method to be used. One of ['dbscan', 'hdbscan'] or a callable that takes a 2d array of coordinates and returns a list of cluster labels. Arguments eps, minClSz and minSamples are ignored if a callable is passed.

'dbscan'
linking_method str

The linking method to be used.

'nearest'
predictor bool | Callable

The predictor method to be used.

True
n_prev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
n_jobs int

Number of jobs to run in parallel (only for clustering algorithm).

1
cost_threshold int

Threshold for filtering low-probability matches (only for transportation linking).

0
reg float

Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).

1
reg_m float

Marginal relaxation parameter for unbalanced OT (only for transportation linking).

10
kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
- epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
- minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
- minSamples: Deprecated parameter for min_samples. Use min_samples instead.
- clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
- linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
- nPrev: Deprecated parameter for n_prev. Use n_prev instead.
- nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.

{}
Source code in arcos4py/tools/_detect_events.py
def __init__(
    self,
    eps: float = 1,
    eps_prev: float | None = None,
    min_clustersize: int = 1,
    min_samples: int | None = None,
    clustering_method: str | Callable = "dbscan",
    linking_method: str = "nearest",
    predictor: bool | Callable = True,
    n_prev: int = 1,
    cost_threshold: float = 0,
    reg: float = 1,
    reg_m: float = 10,
    n_jobs: int = 1,
    **kwargs,
):
    """Initializes the Linker object.

    Arguments:
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
        eps_prev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames. If "None", same value as eps is used.
        min_clustersize (int): The minimum size for a cluster to be identified as a collective event.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clusteringMethod is 'hdbscan'. If None, minSamples =  minClsz.
        clustering_method (str | Callable): The clustering method to be used. One of ['dbscan', 'hdbscan']
            or a callable that takes a 2d array of coordinates and returns a list of cluster labels.
            Arguments `eps`, `minClSz` and `minSamples` are ignored if a callable is passed.
        linking_method (str): The linking method to be used.
        predictor (bool | Callable): The predictor method to be used.
        n_prev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        n_jobs (int): Number of jobs to run in parallel (only for clustering algorithm).
        cost_threshold (int): Threshold for filtering low-probability matches (only for transportation linking).
        reg (float): Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).
        reg_m (float): Marginal relaxation parameter for unbalanced OT (only for transportation linking).
        kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
    """
    map_params = {
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)  # apply deprecated linkingMethod kwarg
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    self._predictor: Predictor | None  # for mypy
    self._memory = Memory(n_timepoints=n_prev)

    if callable(predictor):
        self._predictor = Predictor(predictor)
    elif predictor:
        self._predictor = Predictor.with_default_predictor()
    else:
        self._predictor = None

    self._nn_tree: KDTree | None = None
    if eps_prev is None:
        self._eps_prev = eps
    else:
        self._eps_prev = eps_prev

    self._reg = reg
    self._reg_m = reg_m
    self._cost_threshold = cost_threshold

    self._n_jobs = n_jobs
    self._validate_input(eps, eps_prev, min_clustersize, min_samples, clustering_method, n_prev, n_jobs)

    self.event_ids = np.empty((0, 0), dtype=np.int64)
    self.max_prev_event_id = 0

    if hasattr(clustering_method, '__call__'):  # check if it's callable
        self.clustering_function = clustering_method
    else:
        if clustering_method == "dbscan":
            self.clustering_function = functools.partial(_dbscan, eps=eps, minClSz=min_clustersize)
        elif clustering_method == "hdbscan":
            self.clustering_function = functools.partial(
                _hdbscan, eps=eps, minClSz=min_clustersize, min_samples=min_samples, cluster_selection_method='eom'
            )
        else:
            raise ValueError(
                f'Clustering method must be either in {AVAILABLE_CLUSTERING_METHODS} or a callable with data as the only argument'  # noqa E501
            )

    if hasattr(linking_method, '__call__'):  # check if it's callable
        self.linking_function = linking_method
    else:
        if linking_method == "nearest":
            self.linking_function = 'brute_force_linking'
        elif linking_method == "transportation":
            self.linking_function = 'transportation_linking'
        else:
            raise ValueError(
                f'Linking method must be either in {AVAILABLE_LINKING_METHODS} or a callable'  # noqa E501
            )
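
To make the clustering_method contract concrete, a small construction sketch; the import path is an assumption, and the callable contract is quoted from the docstring above:

import numpy as np
from arcos4py.tools._detect_events import Linker  # import path is an assumption

# Default configuration: dbscan clustering, nearest-neighbour linking.
linker = Linker(eps=2.0, min_clustersize=3, n_prev=1)

# Custom clustering: a callable taking a 2d coordinate array and returning
# one cluster label per row; eps, min_clustersize and min_samples are then ignored.
def single_cluster(coords: np.ndarray) -> list:
    return np.ones(len(coords), dtype=int).tolist()  # label convention follows your method

custom_linker = Linker(clustering_method=single_cluster)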

link(input_coordinates)

Links clusters from the previous frame to the current frame.

Parameters:

Name Type Description Default
input_coordinates ndarray

The coordinates of the current frame.

required

Returns:

Type Description
None

None, modifies internal state with new linked clusters. New event ids are stored in self.event_ids.

Source code in arcos4py/tools/_detect_events.py
def link(self, input_coordinates: np.ndarray) -> None:
    """Links clusters from the previous frame to the current frame.

    Arguments:
        input_coordinates (np.ndarray): The coordinates of the current frame.

    Returns:
        None, modifies internal state with new linked clusters. New event ids are stored in self.event_ids.
    """
    cluster_ids, coordinates, nanrows = self._clustering(input_coordinates)
    # check if first frame
    if not len(self._memory.prev_cluster_ids):
        linked_cluster_ids = self._update_id_empty(cluster_ids)
    # check if anything was detected in current or previous frame
    elif cluster_ids.size == 0 or self._memory.all_cluster_ids.size == 0:
        linked_cluster_ids = self._update_id_empty(cluster_ids)
    else:
        linked_cluster_ids = self._update_id(cluster_ids, coordinates)

    # update memory with current frame and fit predictor if necessary
    self._memory.add_timepoint(new_coordinates=coordinates, new_cluster_ids=linked_cluster_ids)
    if self._predictor is not None and len(self._memory.coordinates) > 1:
        self._predictor.fit(coordinates=self._memory.coordinates, cluster_ids=self._memory.prev_cluster_ids)
    self._memory.remove_timepoint()

    event_ids = np.full_like(nanrows, -1, dtype=np.int64)
    event_ids[~nanrows] = linked_cluster_ids
    self.event_ids = event_ids
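
A frame-by-frame sketch of the link()/event_ids contract described above; coordinates are passed as one (n, d) array per frame, and -1 marks rows not assigned to any event:

import numpy as np
from arcos4py.tools._detect_events import Linker  # import path is an assumption

linker = Linker(eps=1.5, min_clustersize=2)

frames = [
    np.array([[0.0, 0.0], [0.5, 0.5], [10.0, 10.0]]),  # frame 0: a close pair plus an outlier
    np.array([[0.2, 0.1], [0.6, 0.4]]),                # frame 1: the pair moved slightly
]

for t, coords in enumerate(frames):
    linker.link(coords)            # updates internal state only
    print(t, linker.event_ids)     # one id per input row; -1 where no event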

binData(smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', n_jobs=1, **kwargs)

Bases: detrender

Smooth, de-trend, and binarise the input data.

First a short-term median filter with size smoothK is applied to remove fast noise from the time series. If the de-trending method is set to "none", smoothing is applied on globally rescaled time series. The subsequent de-trending can be performed with a long-term median filter with the size biasK {biasMet = "runmed"} or by fitting a polynomial of degree polyDeg {biasMet = "lm"}.

After de-trending, if the global difference between min/max is greater than the threshold, the signal is rescaled to the (0,1) range. The final signal is binarised using the binThr threshold.

Attributes:

Name Type Description
smoothK int

Size of the short-term median smoothing filter.

biasK int

Size of the long-term de-trending median filter.

peakThr float

Threshold for rescaling of the de-trended signal.

binThr float

Threshold for binarizing the de-trended signal.

polyDeg int

Sets the degree of the polynomial for lm fitting.

biasMet str

De-trending method, one of ['runmed', 'lm', 'none'].

Parameters:

Name Type Description Default
smooth_k int

Size of the short-term median smoothing filter.

3
bias_k int

Size of the long-term de-trending median filter.

51
peak_threshold float

Threshold for rescaling of the de-trended signal.

0.2
binarization_threshold float

Threshold for binarizing the de-trended signal.

0.1
polynomial_degree int

Sets the degree of the polynomial for lm fitting.

1
bias_method str

De-trending method, one of ['runmed', 'lm', 'none'].

'runmed'
n_jobs int

Number of jobs to run in parallel.

1
Source code in arcos4py/tools/_binarize_detrend.py
def __init__(
    self,
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    n_jobs: int = 1,
    **kwargs,
) -> None:
    """Smooth, de-trend, and binarise the input data.

    Arguments:
        smooth_k (int): Size of the short-term median smoothing filter.
        bias_k (int): Size of the long-term de-trending median filter.
        peak_threshold (float): Threshold for rescaling of the de-trended signal.
        binarization_threshold (float): Threshold for binarizing the de-trended signal.
        polynomial_degree (int): Sets the degree of the polynomial for lm fitting.
        bias_method (str): De-trending method, one of ['runmed', 'lm', 'none'].
        n_jobs (int): Number of jobs to run in parallel.
    """
    super().__init__(smooth_k, bias_k, peak_threshold, polynomial_degree, bias_method, n_jobs, **kwargs)
    self.binarization_threshold = binarization_threshold

run(x, group_column, measurement_column, frame_column, **kwargs)

Runs binarization and detrending.

If bias_method is 'none', the data is first rescaled to the [0, 1] range, then local smoothing is applied to the measurement by groups, followed by binarization.

If bias_method is one of ['lm', 'runmed'], first the data is detrended locally with a median filter and then detrended globally, for 'lm' with a linear model and for 'runmed' with a median filter. Followed by binarization of the data.

Parameters:

Name Type Description Default
x DataFrame

The time-series data for smoothing, detrending and binarization.

required
group_column str | None

Object id column in x. Detrending and rescaling are performed on a per-object basis. If None, no detrending is performed, only rescaling; bias_method is ignored.

required
measurement_column str

Measurement column in x on which detrending and rescaling is performed.

required
frame_column str

Frame column in Time-series data. Used for sorting.

required
**kwargs Any

Additional keyword arguments. Includes old parameters for backwards compatibility.
- GroupCol (str): Object id column in x. Detrending and rescaling is performed on a per-object basis.
- colMeas (str): Measurement column in x on which detrending and rescaling is performed.
- colFrame (str): Frame column in Time-series data. Used for sorting.

{}

Returns:

Name Type Description
DataFrame DataFrame

Dataframe containing binarized data, rescaled data and the original columns.

Source code in arcos4py/tools/_binarize_detrend.py
def run(
    self, x: pd.DataFrame, group_column: str | None, measurement_column: str, frame_column: str, **kwargs
) -> pd.DataFrame:
    """Runs binarization and detrending.

    If bias_method is 'none', the data is first rescaled to the [0, 1] range, then
    local smoothing is applied to the measurement by groups, followed by
    binarization.

    If bias_method is one of ['lm', 'runmed'], first the data is detrended locally with a
    median filter and then detrended globally, for 'lm' with a linear model and for 'runmed' with a
    median filter.
    Followed by binarization of the data.

    Arguments:
        x (DataFrame): The time-series data for smoothing, detrending and binarization.
        group_column (str | None): Object id column in x. Detrending and rescaling is performed on a per-object basis.
            If None, no detrending is performed, only rescaling; bias_method is ignored.
        measurement_column (str): Measurement column in x on which detrending and rescaling is performed.
        frame_column (str): Frame column in Time-series data. Used for sorting.
        **kwargs (Any): Additional keyword arguments. Includes old parameters for backwards compatibility.
            - GroupCol (str): Object id column in x. Detrending and rescaling is performed on a per-object basis.
            - colMeas (str): Measurement column in x on which detrending and rescaling is performed.
            - colFrame (str): Frame column in Time-series data. Used for sorting.

    Returns:
        DataFrame: Dataframe containing binarized data, rescaled data and the original columns.
    """
    # handle deprecated parameters
    param_mapping = {
        "GroupCol": "group_column",
        "colMeas": "measurement_column",
        "colFrame": "frame_column",
    }
    # allowed_kwargs
    allowed_kwargs = param_mapping.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Invalid keyword argument: {key}")
    updated_kwargs = handle_deprecated_params(param_mapping, **kwargs)

    # update the parameters
    group_column = updated_kwargs.get("group_column", group_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    frame_column = updated_kwargs.get("frame_column", frame_column)

    if group_column is None:
        return self._run_without_groupcol(x, measurement_column, frame_column)
    else:
        return self._run_with_groupcol(x, group_column, measurement_column, frame_column)
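
A usage sketch for the binarization pipeline, assuming binData is exported from arcos4py.tools (the import path is an assumption):

import numpy as np
import pandas as pd
from arcos4py.tools import binData  # import path is an assumption

# Two synthetic tracks of 100 frames with a noisy oscillating measurement.
rng = np.random.default_rng(0)
signal = np.sin(np.linspace(0, 6 * np.pi, 100))
df = pd.DataFrame({
    'time': np.tile(np.arange(100), 2),
    'id': np.repeat([1, 2], 100),
    'meas': np.concatenate([signal, 0.5 * signal]) + rng.normal(0, 0.05, 200),
})

binarizer = binData(smooth_k=3, bias_k=51, binarization_threshold=0.3, bias_method='runmed')
binarized = binarizer.run(df, group_column='id', measurement_column='meas', frame_column='time')
print(binarized.columns.tolist())  # original columns plus rescaled and binarized measurement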

calcCollevStats()

Class to calculate statistics of collective events.

Source code in arcos4py/tools/_stats.py
def __init__(self) -> None:
    """Initialize the class."""
    warnings.warn(
        "The 'calcCollevStats' class is deprecated and will be removed in a future version. "
        "Please use the standalone functions instead (calculate_statistics).",
        DeprecationWarning,
    )

calculate(data, frame_column, collid_column, obj_id_column, posCol=None)

Calculate summary statistics for collective events based on the entire duration of each event.

Parameters:

Name Type Description Default
data DataFrame

Input data containing information on the collective events.

required
frame_column str

The column name representing the frame numbers.

required
collid_column str

The column name representing the collective event IDs.

required
obj_id_column str

The column name representing the object IDs. Defaults to None.

required
posCol list

List of column names representing the position coordinates. Defaults to None.

None

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

Deprecated
Source code in arcos4py/tools/_stats.py
def calculate(
    self,
    data: pd.DataFrame,
    frame_column: str,
    collid_column: str,
    obj_id_column: Union[str, None],
    posCol: Union[list, None] = None,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        collid_column (str): The column name representing the collective event IDs.
        obj_id_column (str, optional): The column name representing the object IDs. Defaults to None.
        posCol (list, optional): List of column names representing the position coordinates. Defaults to None.

    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Deprecated:
        calculate: Use calculate_statistics instead.
    """
    warnings.warn(
        "The 'calculate' method is deprecated and will be removed in a future version. "
        "Please use the 'calculate_statistics' function instead.",
        DeprecationWarning,
    )
    return calculate_statistics(data, frame_column, collid_column, obj_id_column, posCol)

clipMeas(data)

Clip input array.

Parameters:

Name Type Description Default
data ndarray

To be clipped.

required
Source code in arcos4py/tools/_cleandata.py
def __init__(self, data: np.ndarray) -> None:
    """Clips array to quantilles.

    Arguments:
        data (ndarray): To be clipped.
    """
    self.data = data

clip(clip_low=0.001, clip_high=0.999)

Clip input array to upper and lower quantiles defined in clip_low and clip_high.

Parameters:

Name Type Description Default
clip_low float

Lower clipping boundary (quantile).

0.001
clip_high float

Upper clipping boundary (quantile).

0.999

Returns:

Type Description
ndarray

np.ndarray (np.ndarray): A clipped array of the input data.

Source code in arcos4py/tools/_cleandata.py
def clip(self, clip_low: float = 0.001, clip_high: float = 0.999) -> np.ndarray:
    """Clip input array to upper and lower quantiles defined in clip_low and clip_high.

    Arguments:
        clip_low (float): Lower clipping boundary (quantile).
        clip_high (float): Upper clipping boundary (quantile).

    Returns:
        np.ndarray (np.ndarray): A clipped array of the input data.
    """
    low, high = self._calculate_percentile(self.data, clip_low, clip_high)
    out = self.data.clip(low, high)
    return out
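
A minimal sketch, assuming clipMeas is exported from arcos4py.tools (it lives in arcos4py/tools/_cleandata.py per the docs above):

import numpy as np
from arcos4py.tools import clipMeas  # import path is an assumption

data = np.array([0.0, 0.5, 1.0, 50.0])  # one extreme outlier
clipped = clipMeas(data).clip(clip_low=0.01, clip_high=0.95)
print(clipped)  # the outlier is pulled down to the 95th-percentile value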

detectCollev(input_data, eps=1, epsPrev=None, minClSz=1, nPrev=1, posCols=['x'], frame_column='time', id_column=None, bin_meas_column='meas', clid_column='clTrackID', dims='TXY', method='dbscan', min_samples=None, linkingMethod='nearest', n_jobs=1, predictor=False, show_progress=True)

Class to detect collective events.

Attributes:

Name Type Description
input_data Union[DataFrame, ndarray]

The input data to track.

eps float

Maximum distance for clustering, default is 1.

epsPrev Union[float, None]

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

minClSz int

Minimum cluster size. Default is 1.

nPrev int

Number of previous frames to consider. Default is 1.

posCols list

List of column names for the position columns. Default is ["x"].

frame_column str

Name of the column containing the frame number. Default is 'time'.

id_column Union[str, None]

Name of the column containing the id. Default is None.

bin_meas_column Union[str, None]

Name of the column containing the binary measurement. Default is 'meas'.

clid_column str

Name of the column containing the cluster id. Default is 'clTrackID'.

dims str

String of dimensions in order. Default is "TXY". Possible values are "T", "X", "Y", "Z".

method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clusteringMethod is 'hdbscan'. If None, minSamples = minClsz.

linkingMethod str

The method used for linking. Default is 'nearest'.

n_jobs int

Number of jobs to run in parallel. Default is 1.

predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

show_progress bool

Whether or not to show progress bar. Default is True.

Parameters:

Name Type Description Default
input_data DataFrame

Input data to be processed. Must contain a binarized measurement column.

required
eps float

The maximum distance between two samples for one to be considered as in the neighbourhood of the other. This is not a maximum bound on the distances of points within a cluster.

1
epsPrev float | None

Frame to frame distance, value is used to connect collective events across multiple frames. If "None", the same value as eps is used.

None
minClSz int

Minimum size for a cluster to be identified as a collective event.

1
nPrev int

Number of previous frames the tracking algorithm looks back to connect collective events.

1
posCols list

List of position columns contained in the data. Must at least contain one.

['x']
frame_column str

Indicating the frame column in input_data.

'time'
id_column str | None

Indicating the track id/id column in input_data, optional.

None
bin_meas_column str

Indicating the bin_meas_column in input_data or None.

'meas'
clid_column str

Indicating the column name containing the ids of collective events.

'clTrackID'
dims str

String of dimensions in order, used if input_data is a numpy array. Default is "TXY". Possible values are "T", "X", "Y", "Z".

'TXY'
method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clusteringMethod is 'hdbscan'. If None, minSamples = minClsz.

None
linkingMethod str

The method used for linking. Default is 'nearest'.

'nearest'
n_jobs int

Number of parallel workers to spawn, -1 uses all available cpus.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
show_progress bool

Whether or not to show progress bar. Default is True.

True
Source code in arcos4py/tools/_detect_events.py
def __init__(
    self,
    input_data: Union[pd.DataFrame, np.ndarray],
    eps: float = 1,
    epsPrev: Union[float, None] = None,
    minClSz: int = 1,
    nPrev: int = 1,
    posCols: list = ["x"],
    frame_column: str = 'time',
    id_column: Union[str, None] = None,
    bin_meas_column: Union[str, None] = 'meas',
    clid_column: str = 'clTrackID',
    dims: str = "TXY",
    method: str = "dbscan",
    min_samples: int | None = None,
    linkingMethod='nearest',
    n_jobs: int = 1,
    predictor: bool | Callable = False,
    show_progress: bool = True,
) -> None:
    """Constructs class with input parameters.

    Arguments:
        input_data (DataFrame): Input data to be processed. Must contain a binarized measurement column.
        eps (float): The maximum distance between two samples for one to be considered as in
            the neighbourhood of the other.
            This is not a maximum bound on the distances of points within a cluster.
        epsPrev (float | None): Frame to frame distance, value is used to connect
            collective events across multiple frames. If "None", the same value as eps is used.
        minClSz (int): Minimum size for a cluster to be identified as a collective event.
        nPrev (int): Number of previous frames the tracking
            algorithm looks back to connect collective events.
        posCols (list): List of position columns contained in the data.
            Must at least contain one.
        frame_column (str): Indicating the frame column in input_data.
        id_column (str | None): Indicating the track id/id column in input_data, optional.
        bin_meas_column (str): Indicating the bin_meas_column in input_data or None.
        clid_column (str): Indicating the column name containing the ids of collective events.
        dims (str): String of dimensions in order, used if input_data is a numpy array. Default is "TXY".
            Possible values are "T", "X", "Y", "Z".
        method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clusteringMethod is 'hdbscan'. If None, minSamples =  minClsz.
        linkingMethod (str): The method used for linking. Default is 'nearest'.
        n_jobs (int): Number of parallel workers to spawn, -1 uses all available cpus.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        show_progress (bool): Whether or not to show progress bar. Default is True.
    """
    self.input_data = input_data
    self.eps = eps
    self.epsPrev = epsPrev
    self.minClSz = minClSz
    self.nPrev = nPrev
    self.posCols = posCols
    self.frame_column = frame_column
    self.id_column = id_column
    self.bin_meas_column = bin_meas_column
    self.clid_column = clid_column
    self.dims = dims
    self.method = method
    self.linkingMethod = linkingMethod
    self.min_samples = min_samples
    self.predictor = predictor
    self.n_jobs = n_jobs
    self.show_progress = show_progress
    warnings.warn(
        "This class is deprecated and will be removed a future release, use the track_events_dataframe or track_events_image functions directly.",  # noqa: E501
        DeprecationWarning,
    )

run(copy=True)

Runs the collective event detection algorithm.

Parameters:

Name Type Description Default
copy bool

Whether or not to copy the input data. Default is True.

True

Returns:

Name Type Description
DataFrame DataFrame

Input data with added collective event ids.

Source code in arcos4py/tools/_detect_events.py
def run(self, copy: bool = True) -> pd.DataFrame:
    """Runs the collective event detection algorithm.

    Arguments:
        copy (bool): Whether or not to copy the input data. Default is True.

    Returns:
        DataFrame: Input data with added collective event ids.
    """
    if isinstance(self.input_data, pd.DataFrame):
        if copy:
            self.input_data = self.input_data.copy()
        return track_events_dataframe(
            X=self.input_data,
            position_columns=self.posCols,
            frame_column=self.frame_column,
            id_column=self.id_column,
            binarized_measurement_column=self.bin_meas_column,
            clid_column=self.clid_column,
            eps=self.eps,
            eps_prev=self.epsPrev,
            min_clustersize=self.minClSz,
            min_samples=self.min_samples,
            clustering_method=self.method,
            linking_method=self.linkingMethod,
            n_prev=self.nPrev,
            predictor=self.predictor,
            n_jobs=self.n_jobs,
            show_progress=self.show_progress,
        )
    elif isinstance(self.input_data, np.ndarray):
        if copy:
            self.input_data = np.copy(self.input_data)
        return track_events_image(
            X=self.input_data,
            eps=self.eps,
            eps_prev=self.epsPrev,
            min_clustersize=self.minClSz,
            min_samples=self.min_samples,
            clustering_method=self.method,
            n_prev=self.nPrev,
            predictor=self.predictor,
            linking_method=self.linkingMethod,
            dims=self.dims,
            n_jobs=self.n_jobs,
            show_progress=self.show_progress,
        )
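
Since detectCollev is deprecated, a migration sketch using track_events_dataframe, with the keyword names taken from the run() body above (the import path is an assumption):

import pandas as pd
from arcos4py.tools import track_events_dataframe  # import path is an assumption

df = pd.DataFrame({
    'time': [0, 0, 1, 1],
    'x': [0.0, 0.2, 0.1, 0.3],
    'id': [1, 2, 1, 2],
    'meas': [1, 1, 1, 1],  # binarized measurement
})

tracked = track_events_dataframe(
    X=df,
    position_columns=['x'],
    frame_column='time',
    id_column='id',
    binarized_measurement_column='meas',
    clid_column='clTrackID',
    eps=0.5,
    min_clustersize=2,
    show_progress=False,
)
print(tracked.head())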

filterCollev(data, frame_column='time', clid_column='collid', obj_id_column='trackID', **kwargs)

Select collective events that last longer than min_duration and have a total size larger than min_total_size.

Attributes:

Name Type Description
data Dataframe

With detected collective events.

frame_column str

Indicating the frame column in data.

clid_column str

Indicating the collective event id column in data.

obj_id_column str

Indicating the object identifier column such as cell track id.

Parameters:

Name Type Description Default
data Dataframe

With detected collective events.

required
frame_column str

Indicating the frame column in data.

'time'
clid_column str

Indicating the collective event id column in data.

'collid'
obj_id_column str

Indicating the object identifier column such as cell track id.

'trackID'
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- collid_column (str): Deprecated. Use clid_column instead.

{}
Source code in arcos4py/tools/_filter_events.py
def __init__(
    self,
    data: pd.DataFrame,
    frame_column: str = "time",
    clid_column: str = "collid",
    obj_id_column: str = "trackID",
    **kwargs,
):
    """Constructs filterCollev class with Parameters.

    Arguments:
        data (Dataframe): With detected collective events.
        frame_column (str): Indicating the frame column in data.
        clid_column (str): Indicating the collective event id column in data.
        obj_id_column (str): Indicating the object identifier column such as cell track id.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)

    self.data = data
    self.frame_column = frame_column
    self.clid_column = clid_column
    self.obj_id_column = obj_id_column

filter(min_duration=9, min_total_size=10, **kwargs)

Filter collective events.

Method to filter collective events according to the parameters specified in the object instance.

Parameters:

Name Type Description Default
min_duration int

Minimal duration of collective events to be selected.

9
min_total_size int

Minimal total size of collective events to be selected.

10
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- coll_duration (int): Deprecated. Use min_duration instead.
- coll_total_size (int): Deprecated. Use min_total_size instead.

{}

Returns:

Type Description
DataFrame

Returns pandas dataframe containing filtered collective events

Source code in arcos4py/tools/_filter_events.py
def filter(self, min_duration: int = 9, min_total_size: int = 10, **kwargs) -> pd.DataFrame:
    """Filter collective events.

    Method to filter collective events according to the
    parameters specified in the object instance.

    Arguments:
        min_duration (int): Minimal duration of collective events to be selected.
        min_total_size (int): Minimal total size of collective events to be selected.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - coll_duration (int): Deprecated. Use min_duration instead.
            - coll_total_size (int): Deprecated. Use min_total_size instead.

    Returns:
         Returns pandas dataframe containing filtered collective events
    """
    map_deprecated_params = {
        "coll_duration": "min_duration",
        "coll_total_size": "min_total_size",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    min_duration = updated_kwargs.get("min_duration", min_duration)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)

    if self.data.empty:
        return self.data
    stats = calcCollevStats()
    stats_df = stats.calculate(self.data, self.frame_column, self.clid_column, self.obj_id_column)

    filtered_df = self._filter_collev(
        data=self.data,
        clid_stats=stats_df,
        clid_column=self.clid_column,
        min_duration=min_duration,
        min_total_size=min_total_size,
    )
    return filtered_df
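
A short usage sketch, assuming filterCollev is exported from arcos4py.tools:

import pandas as pd
from arcos4py.tools import filterCollev  # import path is an assumption

tracked = pd.DataFrame({
    'time':    [0, 1, 2, 0],
    'trackID': [1, 1, 1, 2],
    'collid':  [1, 1, 1, 2],
})

flt = filterCollev(tracked, frame_column='time', clid_column='collid', obj_id_column='trackID')
# Keep events lasting at least 2 frames with at least 1 participating object;
# event 1 (3 frames) survives, event 2 (1 frame) is dropped.
print(flt.filter(min_duration=2, min_total_size=1))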

interpolation(data)

Interpolate NaN values in a pandas DataFrame.

Attributes:

Name Type Description
data DataFrame

Where NaN should be replaced with interpolated values.

Uses pandas.interpolate with linear interpolation.

Parameters:

Name Type Description Default
data DataFrame

Where NaN should be replaced with interpolated values.

required
Source code in arcos4py/tools/_cleandata.py
def __init__(self, data: pd.DataFrame):
    """Interpolate nan values in a pandas dataframe.

    Uses pandas.interpolate with linear interpolation.

    Arguments:
        data (DataFrame): Where NaN should be replaced with interpolated values.
    """
    self.data = data

interpolate()

Interpolate nan and missing values.

Returns:

Name Type Description
DataFrame DataFrame

Interpolated input data.

Source code in arcos4py/tools/_cleandata.py
def interpolate(self) -> pd.DataFrame:
    """Interpolate nan and missing values.

    Returns:
        DataFrame: Interpolated input data.
    """
    self.data = self.data.interpolate(axis=0)

    return self.data
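
A minimal sketch, assuming interpolation is exported from arcos4py.tools:

import numpy as np
import pandas as pd
from arcos4py.tools import interpolation  # import path is an assumption

df = pd.DataFrame({'meas': [1.0, np.nan, 3.0, np.nan, 5.0]})
# Linear interpolation fills the gaps with 2.0 and 4.0.
print(interpolation(df).interpolate())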

calculate_statistics(data, frame_column='frame', clid_column='collid', obj_id_column=None, position_columns=None, **kwargs)

Calculate summary statistics for collective events based on the entire duration of each event.

Parameters:

Name Type Description Default
data DataFrame

Input data containing information on the collective events.

required
frame_column str

The column name representing the frame numbers.

'frame'
clid_column str

The column name representing the collective event IDs.

'collid'
obj_id_column str

The column name representing the object IDs. Defaults to None.

None
position_columns List[str]

List of column names representing the position coordinates. Defaults to None.

None
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- collid_column (str): Deprecated. Use clid_column instead.
- pos_columns (List[str], optional): Deprecated. Use position_columns instead.

{}

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

Statistics Calculated
  • collid: The unique ID representing each collective event.
  • duration: The duration of each event, calculated as the difference between the maximum and minimum frame values plus one.
  • first_timepoint, last_timepoint: The first and last frames in which each event occurs.
  • total_size: The total number of unique objects involved in each event (calculated if obj_id_column is provided).
  • min_size, max_size: The minimum and maximum size of each event, defined as the number of objects in the event's smallest and largest frames, respectively.
  • first_frame_centroid_x, first_frame_centroid_y, last_frame_centroid_x, last_frame_centroid_y: The x and y coordinates of the centroid of all objects in the first and last frames of each event (calculated if position_columns is provided).
  • centroid_speed: The speed of the centroid, calculated as the distance between the first and last frame centroids divided by the duration (calculated if position_columns is provided).
  • direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided by the change in x (calculated if position_columns is provided).
  • first_frame_spatial_extent, last_frame_spatial_extent: The maximum distance between any pair of objects in the first and last frames (calculated if position_columns is provided).
  • first_frame_convex_hull_area, last_frame_convex_hull_area: The areas of the convex hulls enclosing all objects in the first and last frames (calculated if position_columns is provided).
  • size_variability: The standard deviation of the event size over all frames, providing a measure of the variability in the size of the event over time (calculated if obj_id_column is provided).
Source code in arcos4py/tools/_stats.py
def calculate_statistics(
    data: pd.DataFrame,
    frame_column: str = "frame",
    clid_column: str = "collid",
    obj_id_column: Union[str, None] = None,
    position_columns: Union[List[str], None] = None,
    **kwargs,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        clid_column (str): The column name representing the collective event IDs.
        obj_id_column (str, optional): The column name representing the object IDs. Defaults to None.
        position_columns (List[str], optional): List of column names representing the position coordinates. Defaults to None.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
            - pos_columns (List[str], optional): Deprecated. Use position_columns instead.

    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Statistics Calculated:
        - collid: The unique ID representing each collective event.
        - duration: The duration of each event, calculated as the difference between the maximum
            and minimum frame values plus one.
        - first_timepoint, last_timepoint: The first and last frames in which each event occurs.
        - total_size: The total number of unique objects involved in each event
            (calculated if obj_id_column is provided).
        - min_size, max_size: The minimum and maximum size of each event,
            defined as the number of objects in the event's smallest and largest frames, respectively.
        - first_frame_centroid_x, first_frame_centroid_y, last_frame_centroid_x, last_frame_centroid_y:
            The x and y coordinates of the centroid of all objects in the first and last frames of each event
            (calculated if position_columns is provided).
        - centroid_speed: The speed of the centroid, calculated as the distance between
            the first and last frame centroids divided by the duration (calculated if position_columns is provided).
        - direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided
            by the change in x (calculated if position_columns is provided).
        - first_frame_spatial_extent, last_frame_spatial_extent: The maximum distance between any pair of objects in the
            first and last frames (calculated if position_columns is provided).
        - first_frame_convex_hull_area, last_frame_convex_hull_area: The areas of the convex hulls enclosing all objects
            in the first and last frames (calculated if position_columns is provided).
        - size_variability: The standard deviation of the event size over all frames, providing a measure of the
            variability in the size of the event over time (calculated if obj_id_column is provided).
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
        "pos_columns": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)
    position_columns = updated_kwargs.get("position_columns", position_columns)

    # Error handling: Check if necessary columns are present in the input data
    if data.empty:
        raise ValueError("The input data is empty.")
    necessary_columns = [frame_column, clid_column]
    if obj_id_column:
        necessary_columns.append(obj_id_column)
    if position_columns:
        necessary_columns.extend(position_columns)

    for col in necessary_columns:
        if col not in data.columns and col is not None:
            raise ValueError(f"The column '{col}' is not present in the input data.")

    collid_groups = data.groupby(clid_column)

    # Initialize an empty list to store the statistics
    stats_list = []

    for collid, group_data in collid_groups:

        collid_stats = {clid_column: collid}

        # Grouping by collid_column to get initial statistics
        duration = group_data[frame_column].max() - group_data[frame_column].min() + 1
        collid_stats['duration'] = duration
        collid_stats['first_timepoint'] = group_data[frame_column].min()
        collid_stats['last_timepoint'] = group_data[frame_column].max()

        # If obj_id_column is provided, calculate size related stats
        if obj_id_column:
            total_size = group_data[obj_id_column].nunique()

            collid_stats['total_size'] = total_size

        # calculate min and max size based on the number of objects in each frame
        frame_size_stats = group_data.groupby(frame_column).size()
        collid_stats['min_size'] = frame_size_stats.min()
        collid_stats['max_size'] = frame_size_stats.max()

        # If posCol is provided, calculate centroid coordinates for the
        if position_columns:
            tp_1 = collid_stats['first_timepoint']
            tp_2 = collid_stats['last_timepoint']

            centroid_data = group_data.groupby(frame_column)[position_columns].mean().reset_index()

            for col in position_columns:
                collid_stats[f'first_frame_centroid_{col}'] = centroid_data.query(f'{frame_column} == {tp_1}')[
                    col
                ].to_numpy()[0]
                collid_stats[f'last_frame_centroid_{col}'] = centroid_data.query(f'{frame_column} == {tp_2}')[
                    col
                ].to_numpy()[0]

            # Calculate speed and direction
            speed = np.linalg.norm(
                np.column_stack([collid_stats[f'first_frame_centroid_{col}'] for col in position_columns])
                - np.column_stack([collid_stats[f'last_frame_centroid_{col}'] for col in position_columns]),
                axis=1,
            ) / (collid_stats['duration'] - 1)

            collid_stats['centroid_speed'] = speed[0]

            # Direction For 2D data
            if len(position_columns) == 2:
                collid_stats['direction'] = np.arctan2(
                    collid_stats[f'last_frame_centroid_{position_columns[1]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[1]}'],
                    collid_stats[f'last_frame_centroid_{position_columns[0]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[0]}'],
                )
            # Direction For 3D data
            elif len(position_columns) == 3:
                dx = (
                    collid_stats[f'last_frame_centroid_{position_columns[0]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[0]}']
                )
                dy = (
                    collid_stats[f'last_frame_centroid_{position_columns[1]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[1]}']
                )
                dz = (
                    collid_stats[f'last_frame_centroid_{position_columns[2]}']
                    - collid_stats[f'first_frame_centroid_{position_columns[2]}']
                )

                # Calculate azimuth and elevation
                collid_stats['azimuth'] = np.arctan2(dy, dx)
                collid_stats['elevation'] = np.arctan2(dz, np.sqrt(dx**2 + dy**2))
            else:
                raise ValueError("Position columns can only be 2 or 3.")

            # Loop over first and last frames separately to calculate the spatial extent and convex hull area
            for frame_name, frame_number in zip(['first_frame', 'last_frame'], [tp_1, tp_2]):
                # Get data for either the first or last frame
                frame_data = group_data.query(f'{frame_column} == {frame_number}')

                # Calculate spatial extent
                spatial_extent = pdist(frame_data[position_columns].values).max() if len(frame_data) > 1 else 0
                collid_stats[f'{frame_name}_spatial_extent'] = spatial_extent

                # Calculate convex hull area
                try:
                    convex_hull_area = (
                        ConvexHull(frame_data[position_columns].values).volume
                        if len(frame_data) > len(position_columns)
                        else 0
                    )
                except QhullError:
                    convex_hull_area = 0
                collid_stats[f'{frame_name}_convex_hull_area'] = convex_hull_area

        stats_list.append(collid_stats)

    # Create a DataFrame from the list of statistics
    stats_df = pd.DataFrame(stats_list)

    # Calculate size variability
    if obj_id_column:
        # Calculating size for each collid and frame
        frame_size_stats = data.groupby([clid_column, frame_column])[obj_id_column].nunique().reset_index(name='size')
        size_variability = frame_size_stats.groupby(clid_column)['size'].std().reset_index(name='size_variability')
        stats_df = stats_df.merge(size_variability, on=clid_column, how='left')

    return stats_df
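
A usage sketch on a toy tracked dataframe, assuming calculate_statistics is exported from arcos4py.tools:

import pandas as pd
from arcos4py.tools import calculate_statistics  # import path is an assumption

tracked = pd.DataFrame({
    'frame':  [0, 0, 1, 1, 2],
    'collid': [1, 1, 1, 1, 1],
    'id':     [1, 2, 1, 2, 1],
    'x':      [0.0, 1.0, 0.5, 1.5, 1.0],
    'y':      [0.0, 0.0, 0.5, 0.5, 1.0],
})

stats = calculate_statistics(
    tracked,
    frame_column='frame',
    clid_column='collid',
    obj_id_column='id',
    position_columns=['x', 'y'],
)
print(stats[['collid', 'duration', 'total_size', 'centroid_speed', 'direction']])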

calculate_statistics_per_frame(data, frame_column='frame', clid_column='collid', position_columns=None, **kwargs)

Calculate summary statistics for collective events on a per-frame basis.

Parameters:

Name Type Description Default
data DataFrame

Input data containing information on the collective events.

required
frame_column str

The column name representing the frame numbers.

'frame'
clid_column str

The column name representing the collective event IDs.

'collid'
position_columns List[str]

List of column names representing the position coordinates. Defaults to None.

None
**kwargs Any

Additional keyword arguments. Includes deprecated parameters.
- collid_column (str): Deprecated. Use clid_column instead.
- pos_columns (List[str], optional): Deprecated. Use position_columns instead.

{}

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

Statistics Calculated
  • collid: The unique ID representing each collective event.
  • frame: The frame number.
  • size: The number of objects in the collective event
  • centroid_x, centroid_y: The x and y coordinates of the centroid of all objects in the collective event (calculated if pos_columns is provided).
  • spatial_extent: The maximum distance between any pair of objects in the collective event (calculated if pos_columns is provided).
  • convex_hull_area: The area of the convex hull enclosing all objects in the collective event (calculated if pos_columns is provided).
  • direction: The direction of motion of the centroid, calculated as the arctangent of the change in y divided by the change in x (calculated if pos_columns is provided).
  • centroid_speed: The speed of the centroid, calculated as the norm of the change in x and y divided by the duration (calculated if pos_columns is provided).
Source code in arcos4py/tools/_stats.py
def calculate_statistics_per_frame(
    data: pd.DataFrame,
    frame_column: str = "frame",
    clid_column: str = "collid",
    position_columns: Union[List[str], None] = None,
    **kwargs,
) -> pd.DataFrame:
    """Calculate summary statistics for collective events based on the entire duration of each event.

    Arguments:
        data (pd.DataFrame): Input data containing information on the collective events.
        frame_column (str): The column name representing the frame numbers.
        clid_column (str): The column name representing the collective event IDs.
        position_columns (List[str], optional): List of column names representing the position coordinates. Defaults to None.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - collid_column (str): Deprecated. Use clid_column instead.
            - pos_columns (List[str], optional): Deprecated. Use position_columns instead.


    Returns:
        pd.DataFrame: A DataFrame containing the summary statistics of the collective events.

    Statistics Calculated:
        - collid: The unique ID representing each collective event.
        - frame: The frame number.
        - size: The number of objects in the collective event.
        - centroid_x, centroid_y: The x and y coordinates of the centroid of all objects in the collective event
            (calculated if pos_columns is provided).
        - spatial_extent: The maximum distance between any pair of objects in the collective event
            (calculated if pos_columns is provided).
        - convex_hull_area: The area of the convex hull enclosing all objects in the collective event
            (calculated if pos_columns is provided).
        - direction: The direction of motion of the centroid, calculated as the arctangent of the change in y
            divided by the change in x (calculated if pos_columns is provided).
        - centroid_speed: The speed of the centroid, calculated as the norm of the change
            in x and y divided by the duration (calculated if pos_columns is provided).
    """
    map_deprecated_params = {
        "collid_column": "clid_column",
        "pos_columns": "position_columns",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    clid_column = updated_kwargs.get("clid_column", clid_column)
    position_columns = updated_kwargs.get("position_columns", position_columns)

    if data.empty:
        raise ValueError("The input data is empty.")
    necessary_columns = [frame_column, clid_column]
    if position_columns:
        necessary_columns.extend(position_columns)

    for col in necessary_columns:
        if col not in data.columns and col is not None:
            raise ValueError(f"The column '{col}' is not present in the input data.")

    collid_groups = data.groupby([frame_column, clid_column])
    stats_list = []

    for (frame, collid), group_data in collid_groups:

        frame_stats = {clid_column: collid, frame_column: frame}

        frame_stats['size'] = len(group_data)  # number of objects in this frame

        # If pos_columns are provided, calculate spatial statistics for this frame
        if position_columns:
            # Calculate centroid
            centroid = group_data[position_columns].mean().to_dict()
            for pos_col, cent_val in centroid.items():
                frame_stats[f'centroid_{pos_col}'] = cent_val

            # Calculate spatial extent
            spatial_extent = pdist(group_data[position_columns].values).max() if len(group_data) > 1 else 0
            frame_stats['spatial_extent'] = spatial_extent

            # Calculate convex hull area
            try:
                convex_hull_area = (
                    ConvexHull(group_data[position_columns].values).volume
                    if len(group_data) > len(position_columns)
                    else 0
                )
            except QhullError:
                convex_hull_area = 0
            frame_stats['convex_hull_area'] = convex_hull_area

        stats_list.append(frame_stats)

    # Create a DataFrame from the list of statistics
    stats_df = pd.DataFrame(stats_list)

    # If pos_columns are provided, we can calculate speed and direction by looking at changes between frames
    if position_columns:
        stats_df.sort_values(by=[clid_column, frame_column], inplace=True)

        for col in position_columns:
            stats_df[f'delta_{col}'] = stats_df.groupby(clid_column)[f'centroid_{col}'].diff()

        # Calculate speed (the norm of the delta vector)
        stats_df['centroid_speed'] = np.linalg.norm(
            stats_df[[f'delta_{col}' for col in position_columns]].values, axis=1
        )

        # Calculate direction (only for 2D)
        if len(position_columns) == 2:
            stats_df['direction'] = np.arctan2(
                stats_df['delta_' + position_columns[1]], stats_df['delta_' + position_columns[0]]
            )

        # Clean up temporary delta columns
        stats_df.drop(columns=[f'delta_{col}' for col in position_columns], inplace=True)

    return stats_df
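
A short sketch of the per-frame variant on toy data (column names are illustrative):

import pandas as pd

from arcos4py.tools import calculate_statistics_per_frame

df_tracked = pd.DataFrame({
    'frame': [0, 0, 0, 1, 1, 1],
    'x': [0.0, 1.0, 0.5, 0.1, 1.1, 0.6],
    'y': [0.0, 0.0, 1.0, 0.1, 0.1, 1.1],
    'collid': [1, 1, 1, 1, 1, 1],
})

per_frame = calculate_statistics_per_frame(
    df_tracked,
    frame_column='frame',
    clid_column='collid',
    position_columns=['x', 'y'],
)
# One row per (collid, frame): size, centroid_*, spatial_extent, convex_hull_area,
# plus centroid_speed/direction derived from frame-to-frame centroid displacement.
print(per_frame)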

estimate_eps(data, method='kneepoint', position_columns=['x', 'y'], frame_column='t', n_neighbors=5, plot=True, plt_size=(5, 5), max_samples=50000, **kwargs)

Estimates eps parameter in DBSCAN.

Estimates the eps parameter for the DBSCAN clustering method, as used by ARCOS, by calculating the nearest neighbour distances for each point in the data. n_neighbors should be chosen to match the minimum points parameter in DBSCAN or the minimum cluster size in detect_events, respectively. The method argument determines how the eps parameter is estimated: 'kneepoint' locates the knee of the nearest neighbour distance distribution, while 'mean' and 'median' return (by default) 1.5 times the mean or median of the nearest neighbour distances, respectively.

Parameters:

Name Type Description Default
data DataFrame

DataFrame containing the data.

required
method str

Method to use for estimating eps. Defaults to 'kneepoint'. Can be one of ['kneepoint', 'mean', 'median']. 'kneepoint' locates the knee of the nearest neighbour distance distribution to estimate eps; 'mean' and 'median' use 1.5 times the mean or median of the nearest neighbour distances, respectively.

'kneepoint'
position_columns list[str]

List of column names containing the position data.

['x', 'y']
frame_column str

Name of the column containing the frame number. Defaults to 't'.

't'
n_neighbors int

Number of nearest neighbours to consider. Defaults to 5.

5
plot bool

Whether to plot the results. Defaults to True.

True
plt_size tuple[int, int]

Size of the plot. Defaults to (5, 5).

(5, 5)
max_samples int

Maximum number of nearest neighbour distances to sample; larger sets are randomly subsampled. Defaults to 50000.

50000
kwargs Any

Keyword arguments for the method; modify the behaviour of the respective method. For kneepoint: [S, online, curve, direction, interp_method, polynomial_degree]; for mean: [mean_multiplier]; for median: [median_multiplier].

{}

Returns:

Name Type Description
Eps float

eps parameter for DBSCAN.

Source code in arcos4py/tools/_detect_events.py
def estimate_eps(
    data: pd.DataFrame,
    method: str = 'kneepoint',
    position_columns: list[str] = ['x', 'y'],
    frame_column: str = 't',
    n_neighbors: int = 5,
    plot: bool = True,
    plt_size: tuple[int, int] = (5, 5),
    max_samples=50_000,
    **kwargs: dict,
):
    """Estimates eps parameter in DBSCAN.

    Estimates the eps parameter for the DBSCAN clustering method, as used by ARCOS,
    by calculating the nearest neighbour distances for each point in the data.
    n_neighbors should be chosen to match the minimum points parameter in DBSCAN
    or the minimum cluster size in detect_events, respectively.
    The method argument determines how the eps parameter is estimated:
    'kneepoint' locates the knee of the nearest neighbour distance distribution,
    while 'mean' and 'median' return (by default) 1.5 times
    the mean or median of the nearest neighbour distances, respectively.

    Arguments:
        data (pd.DataFrame): DataFrame containing the data.
        method (str, optional): Method to use for estimating eps. Defaults to 'kneepoint'.
            Can be one of ['kneepoint', 'mean', 'median']. 'kneepoint' locates the knee of the nearest neighbour
            distance distribution to estimate eps. 'mean' and 'median' use 1.5 times the mean or median of the
            nearest neighbour distances, respectively.
        position_columns (list[str]): List of column names containing the position data.
        frame_column (str, optional): Name of the column containing the frame number. Defaults to 't'.
        n_neighbors (int, optional): Number of nearest neighbours to consider. Defaults to 5.
        plot (bool, optional): Whether to plot the results. Defaults to True.
        plt_size (tuple[int, int], optional): Size of the plot. Defaults to (5, 5).
        max_samples (int, optional): Maximum number of nearest neighbour distances to sample;
            larger sets are randomly subsampled. Defaults to 50_000.
        kwargs (Any): Keyword arguments for the method. Modify the behaviour of the respective method.
            For kneepoint: [S, online, curve, direction, interp_method, polynomial_degree];
            for mean: [mean_multiplier]; for median: [median_multiplier].

    Returns:
        Eps (float): eps parameter for DBSCAN.
    """
    method_option = ['kneepoint', 'mean', 'median']

    if method not in method_option:
        raise ValueError(f"Method must be one of {method_option}")

    allowedtypes: dict[str, str] = {
        'kneepoint': 'kneepoint_values',
        'mean': 'mean_values',
        'median': 'median_values',
    }

    kwdefaults: dict[str, Any] = {
        'S': 1,
        'online': True,
        'curve': 'convex',
        'direction': 'increasing',
        'interp_method': 'polynomial',
        'mean_multiplier': 1.5,
        'median_multiplier': 1.5,
        'polynomial_degree': 7,
    }

    kwtypes: dict[str, Any] = {
        'S': int,
        'online': bool,
        'curve': str,
        'direction': str,
        'interp_method': str,
        'polynomial_degree': int,
        'mean_multiplier': (float, int),
        'median_multiplier': (float, int),
        'pos_cols': list,
        'frame_col': str,
    }

    allowedkwargs: dict[str, list[str]] = {
        'kneepoint_values': ['S', 'online', 'curve', 'interp_method', 'direction', 'polynomial_degree'],
        'mean_values': ['mean_multiplier'],
        'median_values': ['median_multiplier'],
    }

    map_deprecated_parameters = {
        'pos_cols': 'position_columns',
        'frame_col': 'frame_column',
    }

    for key in kwargs:
        if key not in allowedkwargs[allowedtypes[method]] and key not in map_deprecated_parameters:
            raise ValueError(f'{key} keyword not in allowed keywords {allowedkwargs[allowedtypes[method]]}')
        if not isinstance(kwargs[key], kwtypes[key]):
            raise ValueError(f'{key} must be of type {kwtypes[key]}')

    # Set kwarg defaults
    for kw in allowedkwargs[allowedtypes[method]]:
        kwargs.setdefault(kw, kwdefaults[kw])

    kwargs = handle_deprecated_params(map_deprecated_parameters, **kwargs)

    # assign parameters
    position_columns = kwargs.get('position_columns', position_columns)  # type: ignore
    frame_column = kwargs.get('frame_column', frame_column)  # type: ignore

    # remove deprecated parameters
    for key in map_deprecated_parameters:
        if key in kwargs:
            del kwargs[key]

    subset = [frame_column] + position_columns
    for col in subset:
        if col not in data.columns:
            raise ValueError(f"Column {col} not in data")

    data_np = data[subset].to_numpy(dtype=np.float64)
    # sort by frame
    data_np = data_np[data_np[:, 0].argsort()]
    grouped_array = np.split(data_np[:, 1:], np.unique(data_np[:, 0], axis=0, return_index=True)[1][1:])
    # map nearest_neighbours to grouped_array
    distances = [_nearest_neighbour_eps(i, n_neighbors) for i in grouped_array if i.shape[0] >= n_neighbors]
    if not distances:
        distances_array = np.array([])
    else:
        distances_array = np.concatenate(distances)
    # flatten array
    distances_flat = distances_array.flatten()
    distances_flat = distances_flat[np.isfinite(distances_flat)]
    distances_flat_selection = np.random.choice(
        distances_flat, min(max_samples, distances_flat.shape[0]), replace=False
    )
    distances_sorted = np.sort(distances_flat_selection)
    if distances_sorted.shape[0] == 0:
        raise ValueError('No valid distances found, please check input data.')
    if method == 'kneepoint':
        k1 = KneeLocator(
            np.arange(0, distances_sorted.shape[0]),
            distances_sorted,
            S=kwargs['S'],
            online=kwargs['online'],
            curve=kwargs['curve'],
            interp_method=kwargs['interp_method'],
            direction=kwargs['direction'],
            polynomial_degree=kwargs['polynomial_degree'],
        )

        eps = distances_sorted[k1.knee]

    elif method == 'mean':
        eps = np.mean(distances_sorted) * kwargs['mean_multiplier']

    elif method == 'median':
        eps = np.median(distances_sorted) * kwargs['median_multiplier']

    if plot:
        fig, ax = plt.subplots(figsize=plt_size)
        ax.plot(distances_sorted)
        ax.axhline(eps, color='r', linestyle='--')
        ax.set_xlabel('Sorted Distance Index')
        ax.set_ylabel('Nearest Neighbour Distance')
        ax.set_title(f'Estimated eps: {eps:.4f}')
        plt.show()

    return eps
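
A quick sketch of eps estimation on synthetic points; the random data and the choice of method='median' are purely illustrative:

import numpy as np
import pandas as pd

from arcos4py.tools import estimate_eps

rng = np.random.default_rng(42)
df = pd.DataFrame({
    't': np.repeat(np.arange(10), 50),       # 10 frames, 50 points each
    'x': rng.uniform(0, 100, 500),
    'y': rng.uniform(0, 100, 500),
})

eps = estimate_eps(
    df,
    method='median',                         # 1.5 * median of NN distances by default
    position_columns=['x', 'y'],
    frame_column='t',
    n_neighbors=5,
    plot=False,
)
print(f'estimated eps: {eps:.3f}')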

remove_image_background(image, filter_type='gaussian', size=(10, 1, 1), dims='TXY', crop_time_axis=False)

Removes background from images. Assumes axis order (t, y, x) for 2d images and (t, z, y, x) for 3d images.

Parameters:

Name Type Description Default
image ndarray

Image to remove background from.

required
filter_type Union[str, function]

Filter to use to remove background. Can be one of ['median', 'gaussian'].

'gaussian'
size (int, Tuple)

Size of filter to use. For median filter, this is the size of the window. For gaussian filter, this is the standard deviation. If a single int is passed in, it is assumed to be the same for all dimensions. If a tuple is passed in, it is assumed to correspond to the size of the filter in each dimension. Default is (10, 1, 1).

(10, 1, 1)
dims str

Dimensions to apply filter over. Can be one of ['TXY', 'TZXY']. Default is 'TXY'.

'TXY'
crop_time_axis bool

Whether to crop the time axis. Default is False.

False

Returns (np.ndarray): Image with background removed. If crop_time_axis is True, half of the filter size along the first axis (t) is removed from the beginning and end, respectively.

Source code in arcos4py/tools/_cleandata.py
def remove_image_background(
    image: np.ndarray, filter_type: str = 'gaussian', size=(10, 1, 1), dims="TXY", crop_time_axis: bool = False
) -> np.ndarray:
    """Removes background from images. Assumes axis order (t, y, x) for 2d images and (t, z, y, x) for 3d images.

    Arguments:
        image (np.ndarray): Image to remove background from.
        filter_type (Union[str, function]): Filter to use to remove background. Can be one of ['median', 'gaussian'].
        size (int, Tuple): Size of filter to use. For median filter, this is the size of the window.
            For gaussian filter, this is the standard deviation.
            If a single int is passed in, it is assumed to be the same for all dimensions.
            If a tuple is passed in, it is assumed to correspond to the size of the filter in each dimension.
            Default is (10, 1, 1).
        dims (str): Dimensions to apply filter over. Can be one of ['TXY', 'TZXY']. Default is 'TXY'.
        crop_time_axis (bool): Whether to crop the time axis. Default is False.

    Returns (np.ndarray): Image with background removed.
        If crop_time_axis is True, half of the filter size along the first axis (t) is removed
        from the beginning and end, respectively.
    """
    # correct images with a filter applied over time
    allowed_filters = ["median", "gaussian"]
    dims_list = list(dims.upper())

    # check that every dimension is a valid axis label
    for i in dims_list:
        if i not in ("T", "X", "Y", "Z"):
            raise ValueError(f"Invalid dimension {i}. Must be 'T', 'X', 'Y', or 'Z'.")

    if len(dims_list) > len(set(dims_list)):
        raise ValueError("Duplicate dimensions in dims.")

    if len(dims_list) != image.ndim:
        raise ValueError(
            f"Length of dims must be equal to number of dimensions in image. Image has {image.ndim} dimensions."
        )

    if filter_type not in allowed_filters:
        raise ValueError(f'Filter type must be one of {allowed_filters}.')

    # get index of time axis
    t_idx = dims_list.index("T")

    orig_image = image.copy()

    if isinstance(size, int):
        size = (size,) * image.ndim
    elif isinstance(size, tuple):
        if len(size) != image.ndim:
            raise ValueError(f'Filter size must have {image.ndim} dimensions.')
        # check size of dimensions are compatible with image
        for idx, s in enumerate(size):
            if s > image.shape[idx]:
                raise ValueError(f'Filter size in dimension {idx} is larger than image size in that dimension.')
    else:
        raise ValueError('Filter size must be an int or tuple.')

    if filter_type == 'median':
        filtered = median_filter(orig_image, size=size)
    elif filter_type == 'gaussian':
        filtered = gaussian_filter(orig_image, sigma=size)

    # crop time axis if necessary
    shift = size[t_idx] // 2
    corr = np.subtract(orig_image, filtered, dtype=np.float32)
    if crop_time_axis:
        corr = corr[shift:-shift]

    return corr
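
A minimal sketch of background removal on a random (t, y, x) stack, showing what crop_time_axis does to the output shape:

import numpy as np

from arcos4py.tools import remove_image_background

stack = np.random.rand(50, 64, 64).astype(np.float32)  # axis order (t, y, x)

corrected = remove_image_background(
    stack,
    filter_type='gaussian',
    size=(10, 1, 1),        # sigma per axis for the gaussian filter
    dims='TXY',
    crop_time_axis=True,    # drops size[t] // 2 frames at each end
)
print(stack.shape, '->', corrected.shape)  # (50, 64, 64) -> (40, 64, 64)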

track_events_dataframe(X, position_columns, frame_column, id_column, binarized_measurement_column=None, clid_column='collid', eps=1.0, eps_prev=None, min_clustersize=3, min_samples=None, clustering_method='dbscan', linking_method='nearest', n_prev=1, predictor=False, n_jobs=1, show_progress=True, **kwargs)

Function to track collective events in a dataframe.

Parameters:

Name Type Description Default
X DataFrame

The input dataframe containing the data to track.

required
position_columns List[str]

The names of the columns representing coordinates.

required
frame_column str

The name of the column containing frame ids.

required
id_column str | None

The name of the column representing IDs. None if no such column.

required
binarized_measurement_column str | None

The name of the column representing binarized measurements, if None all measurements are used.

None
clid_column str

The name of the output column representing collective events, will be generated.

'collid'
eps float

Maximum distance for clustering, default is 1.

1.0
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 3.

3
min_samples int

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
linking_method str

The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.

'nearest'
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead. - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead. - minSamples: Deprecated parameter for min_samples. Use min_samples instead. - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead. - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead. - nPrev: Deprecated parameter for n_prev. Use n_prev instead. - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead. - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
DataFrame

pd.DataFrame: Dataframe with tracked events.

Source code in arcos4py/tools/_detect_events.py
def track_events_dataframe(
    X: pd.DataFrame,
    position_columns: List[str],
    frame_column: str,
    id_column: str | None,
    binarized_measurement_column: str | None = None,
    clid_column: str = "collid",
    eps: float = 1.0,
    eps_prev: float | None = None,
    min_clustersize: int = 3,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    linking_method: str = 'nearest',
    n_prev: int = 1,
    predictor: bool | Callable = False,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """Function to track collective events in a dataframe.

    Arguments:
        X (pd.DataFrame): The input dataframe containing the data to track.
        position_columns (List[str]): The names of the columns representing coordinates.
        frame_column (str): The name of the column containing frame ids.
        id_column (str | None): The name of the column representing IDs. None if no such column.
        binarized_measurement_column (str | None): The name of the column representing binarized measurements,
            if None all measurements are used.
        clid_column (str): The name of the output column representing collective events, will be generated.
        eps (float): Maximum distance for clustering, default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 3.
        min_samples (int): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        linking_method (str): The method used for linking, one of ['nearest', 'transportsolver']. Default is 'nearest'.
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        pd.DataFrame: Dataframe with tracked events.
    """
    map_params = {
        "coordinates_column": "position_columns",
        "bin_meas_column": "binarized_measurement_column",
        "collid_column": "clid_column",
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    linker = Linker(
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        min_samples=min_samples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        n_jobs=n_jobs,
    )
    tracker = DataFrameTracker(
        linker=linker,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=id_column,
        binarized_measurement_column=binarized_measurement_column,
        clid_column=clid_column,
    )
    df_out = pd.concat(
        [timepoint for timepoint in tqdm(tracker.track(X), total=X[frame_column].nunique(), disable=not show_progress)]
    ).reset_index(drop=True)
    return df_out.query(f"{clid_column} != -1").reset_index(drop=True)
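
A toy end-to-end call on synthetic, pre-binarized data; the eps and min_clustersize values are illustrative, not recommendations:

import pandas as pd

from arcos4py.tools import track_events_dataframe

df_bin = pd.DataFrame({
    'frame': [0, 0, 0, 1, 1, 1],
    'x': [0.0, 0.5, 1.0, 0.1, 0.6, 1.1],
    'y': [0.0, 0.5, 1.0, 0.1, 0.6, 1.1],
    'id': [1, 2, 3, 1, 2, 3],
    'm_bin': [1, 1, 1, 1, 1, 1],   # measurement already binarized
})

events = track_events_dataframe(
    df_bin,
    position_columns=['x', 'y'],
    frame_column='frame',
    id_column='id',
    binarized_measurement_column='m_bin',
    eps=1.0,
    min_clustersize=3,
    show_progress=False,
)
print(events)  # input rows plus a 'collid' column; unclustered rows are dropped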

track_events_image(X, eps=1, eps_prev=None, min_clustersize=1, min_samples=None, clustering_method='dbscan', n_prev=1, predictor=False, linking_method='nearest', reg=1, reg_m=10, cost_threshold=0, dims='TXY', downsample=1, n_jobs=1, show_progress=True, **kwargs)

Function to track events in an image using specified linking and clustering methods.

Parameters:

Name Type Description Default
X ndarray

The input array containing the images to track.

required
eps float

Distance for clustering. Default is 1.

1
eps_prev float | None

Maximum distance for linking previous clusters, if None, eps is used. Default is None.

None
min_clustersize int

Minimum cluster size. Default is 1.

1
min_samples int | None

The number of samples (or total weight) in a neighbourhood for a point to be considered as a core point. This includes the point itself. Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.

None
clustering_method str

The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".

'dbscan'
n_prev int

Number of previous frames to consider. Default is 1.

1
predictor bool | Callable

Whether or not to use a predictor. Default is False. True uses the default predictor. A callable can be passed to use a custom predictor. See default predictor method for details.

False
linking_method str

The method used for linking. Default is 'nearest'.

'nearest'
reg float

Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).

1
reg_m float

Marginal relaxation parameter for unbalanced OT (only for transportation linking).

10
cost_threshold float

Threshold for filtering low-probability matches (only for transportation linking).

0
dims str

String of dimensions in order, such as 'TXY'. Default is "TXY". Possible values are "T", "X", "Y", "Z".

'TXY'
downsample int

Factor by which to downsample the image. Default is 1.

1
n_jobs int

Number of jobs to run in parallel. Default is 1.

1
show_progress bool

Whether or not to show progress bar. Default is True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters for backwards compatibility. - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead. - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead. - minSamples: Deprecated parameter for min_samples. Use min_samples instead. - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead. - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead. - nPrev: Deprecated parameter for n_prev. Use n_prev instead. - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead. - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

{}

Returns:

Type Description
ndarray

np.ndarray: Array of images with tracked events.

Source code in arcos4py/tools/_detect_events.py
def track_events_image(
    X: np.ndarray,
    eps: float = 1,
    eps_prev: float | None = None,
    min_clustersize: int = 1,
    min_samples: int | None = None,
    clustering_method: str = "dbscan",
    n_prev: int = 1,
    predictor: bool | Callable = False,
    linking_method: str = 'nearest',
    reg: float = 1,
    reg_m: float = 10,
    cost_threshold: float = 0,
    dims: str = "TXY",
    downsample: int = 1,
    n_jobs: int = 1,
    show_progress: bool = True,
    **kwargs,
) -> np.ndarray:
    """Function to track events in an image using specified linking and clustering methods.

    Arguments:
        X (np.ndarray): The input array containing the images to track.
        eps (float): Distance for clustering. Default is 1.
        eps_prev (float | None): Maximum distance for linking previous clusters, if None, eps is used. Default is None.
        min_clustersize (int): Minimum cluster size. Default is 1.
        min_samples (int | None): The number of samples (or total weight) in a neighbourhood for a
            point to be considered as a core point. This includes the point itself.
            Only used if clustering_method is 'hdbscan'. If None, min_samples = min_clustersize.
        clustering_method (str): The method used for clustering, one of [dbscan, hdbscan]. Default is "dbscan".
        n_prev (int): Number of previous frames to consider. Default is 1.
        predictor (bool | Callable): Whether or not to use a predictor. Default is False.
            True uses the default predictor. A callable can be passed to use a custom predictor.
            See default predictor method for details.
        linking_method (str): The method used for linking. Default is 'nearest'.
        reg (float): Entropy regularization parameter for unbalanced OT algorithm (only for transportation linking).
        reg_m (float): Marginal relaxation parameter for unbalanced OT (only for transportation linking).
        cost_threshold (float): Threshold for filtering low-probability matches (only for transportation linking).
        dims (str): String of dimensions in order, such as 'TXY'. Default is "TXY". Possible values are "T", "X", "Y", "Z".
        downsample (int): Factor by which to downsample the image. Default is 1.
        n_jobs (int): Number of jobs to run in parallel. Default is 1.
        show_progress (bool): Whether or not to show progress bar. Default is True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters for backwards compatibility.
            - epsPrev: Deprecated parameter for eps_prev. Use eps_prev instead.
            - minClSz: Deprecated parameter for min_clustersize. Use min_clustersize instead.
            - minSamples: Deprecated parameter for min_samples. Use min_samples instead.
            - clusteringMethod: Deprecated parameter for clustering_method. Use clustering_method instead.
            - linkingMethod: Deprecated parameter for linking_method. Use linking_method instead.
            - nPrev: Deprecated parameter for n_prev. Use n_prev instead.
            - nJobs: Deprecated parameter for n_jobs. Use n_jobs instead.
            - showProgress: Deprecated parameter for show_progress. Use show_progress instead.

    Returns:
        np.ndarray: Array of images with tracked events.
    """
    map_params = {
        'epsPrev': 'eps_prev',
        'minClSz': 'min_clustersize',
        'minSamples': 'min_samples',
        'clusteringMethod': 'clustering_method',
        'linkingMethod': 'linking_method',
        'nPrev': 'n_prev',
        'nJobs': 'n_jobs',
        'showProgress': 'show_progress',
    }

    # check for allowed kwargs
    for key in kwargs:
        if key not in map_params.keys():
            raise ValueError(f'Invalid keyword argument {key}')

    # Handle deprecated parameters
    kwargs = handle_deprecated_params(map_params, **kwargs)

    # Assign parameters
    eps_prev = kwargs.get('eps_prev', eps_prev)
    min_clustersize = kwargs.get('min_clustersize', min_clustersize)
    min_samples = kwargs.get('min_samples', min_samples)
    clustering_method = kwargs.get('clustering_method', clustering_method)
    linking_method = kwargs.get('linking_method', linking_method)
    n_prev = kwargs.get('n_prev', n_prev)
    n_jobs = kwargs.get('n_jobs', n_jobs)

    # Determine the dimensionality
    spatial_dims = set("XYZ")
    D = len([d for d in dims if d in spatial_dims])

    # Adjust parameters based on dimensionality
    adjusted_epsPrev = eps_prev / downsample if eps_prev is not None else None
    adjusted_minClSz = int(min_clustersize / (downsample**D))
    adjusted_minSamples = int(min_samples / (downsample**D)) if min_samples is not None else None

    linker = Linker(
        eps=eps / downsample,
        eps_prev=adjusted_epsPrev,
        min_clustersize=adjusted_minClSz,
        min_samples=adjusted_minSamples,
        clustering_method=clustering_method,
        linking_method=linking_method,
        n_prev=n_prev,
        predictor=predictor,
        reg=reg,
        reg_m=reg_m,
        cost_threshold=cost_threshold,
        n_jobs=n_jobs,
    )
    tracker = ImageTracker(linker, downsample=downsample)
    # find indices of T in dims
    T_index = dims.upper().index("T")
    return np.stack(
        [timepoint for timepoint in tqdm(tracker.track(X, dims), total=X.shape[T_index], disable=not show_progress)],
        axis=T_index,
    )
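
A small sketch on a synthetic binary image stack, where a static active patch is tracked across frames:

import numpy as np

from arcos4py.tools import track_events_image

stack = np.zeros((5, 32, 32), dtype=np.uint8)
stack[:, 10:14, 10:14] = 1          # active pixels in every frame

labels = track_events_image(
    stack,
    eps=2,
    min_clustersize=4,
    dims='TXY',
    show_progress=False,
)
print(labels.shape, np.unique(labels))  # same shape as input; 0 is background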

validation

Tools for validating detected collective events.

bootstrap_arcos(df, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column='m', method='shuffle_tracks', smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', eps=2, eps_prev=None, min_clustersize=1, n_prev=1, min_duration=1, min_total_size=1, stats_metric=['total_size', 'duration'], pval_alternative='greater', finite_correction=True, n=100, seed=42, allow_duplicates=False, max_tries=100, show_progress=True, verbose=False, parallel_processing=True, plot=True, **kwargs)

Bootstrap data using the ARCOS algorithm.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing the data to be bootstrapped.

required
position_columns list

List of column names containing the x and y coordinates.

['x']
frame_column str

Name of the column containing the frame number.

'frame'
obj_id_column str

Name of the column containing the track id.

'obj_id'
measurement_column str

Name of the column containing the measurement.

'm'
method str | list[str]

Method used for bootstrapping. Can be 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint', or a list of methods, which will be applied in the order given.

'shuffle_tracks'
smooth_k int

Smoothing kernel size.

3
bias_k int

Bias kernel size.

51
peak_threshold float

Threshold for peak detection.

0.2
binarization_threshold float

Threshold for binarization.

0.1
polynomial_degree int

Degree of the polynomial used for bias correction.

1
bias_method str

Bias correction method. Can be 'none', 'runmed', 'lm'

'runmed'
eps float

Epsilon parameter for DBSCAN.

2
eps_prev int | None

Parameter for linking tracks. If None, eps is used.

None
min_clustersize int

Minimum cluster size.

1
n_prev int

Number of previous frames to consider for linking.

1
min_duration int

Minimum duration of a track.

1
min_total_size int

Minimum size of a track.

1
stats_metric str | list[str]

Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics. Default is ["duration", "total_size"].

['total_size', 'duration']
pval_alternative str

Alternative hypothesis for the p-value calculation. Can be "less" or "greater".

'greater'
finite_correction bool

Correct p-values for finite sampling. Default is True.

True
n int

Number of bootstraps.

100
seed int

Seed for the random number generator.

42
allow_duplicates bool

If False, resampling will check if the resampled data contains duplicates. If True, duplicates will be allowed.

False
max_tries int

Maximum number of tries to resample data without duplicates.

100
show_progress bool

Show a progress bar.

True
verbose bool

Print additional information.

False
parallel_processing bool

Use parallel processing.

True
plot bool

Plot the distribution of the bootstrapped data.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - id_column: Deprecated. Use obj_id_column instead. - meas_column: Deprecated. Use measurement_column instead. - smoothK: Deprecated. Use smooth_k instead. - biasK: Deprecated. Use bias_k instead. - peakThr: Deprecated. Use peak_threshold instead. - binThr: Deprecated. Use binarization_threshold instead. - polyDeg: Deprecated. Use polynomial_degree instead. - biasMet: Deprecated. Use bias_method instead. - epsPrev: Deprecated. Use eps_prev instead. - minClsz: Deprecated. Use min_clustersize instead. - min_size: Deprecated. Use min_total_size instead. - paralell_processing: Deprecated. Use parallel_processing instead.

{}

Returns:

Type Description
DataFrame

DataFrame containing the bootstrapped data.

Source code in arcos4py/validation/_bootstrapping.py
def bootstrap_arcos(
    df: pd.DataFrame,
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: str = 'm',
    method: str | list[str] = 'shuffle_tracks',
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    eps: float = 2,
    eps_prev: int | None = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    min_duration: int = 1,
    min_total_size: int = 1,
    stats_metric: str | list[str] = ["total_size", "duration"],
    pval_alternative: str = "greater",
    finite_correction: bool = True,
    n: int = 100,
    seed: int = 42,
    allow_duplicates: bool = False,
    max_tries: int = 100,
    show_progress: bool = True,
    verbose: bool = False,
    parallel_processing: bool = True,
    plot: bool = True,
    **kwargs,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Bootstrap data using the ARCOS algorithm.

    Arguments:
        df: DataFrame containing the data to be bootstrapped.
        position_columns: List of column names containing the x and y coordinates.
        frame_column: Name of the column containing the frame number.
        obj_id_column: Name of the column containing the track id.
        measurement_column: Name of the column containing the measurement.
        method: Method used for bootstrapping. Can be 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints',
            'shuffle_binary_blocks', 'shuffle_coordinates_timepoint', or a list of methods,
            which will be applied in the order given.
        smooth_k: Smoothing kernel size.
        bias_k: Bias kernel size.
        peak_threshold: Threshold for peak detection.
        binarization_threshold: Threshold for binarization.
        polynomial_degree: Degree of the polynomial used for bias correction.
        bias_method: Bias correction method. Can be 'none', 'runmed', 'lm'
        eps: Epsilon parameter for DBSCAN.
        eps_prev: Parameter for linking tracks. If None, eps is used.
        min_clustersize: Minimum cluster size.
        n_prev: Number of previous frames to consider for linking.
        min_duration: Minimum duration of a track.
        min_total_size: Minimum size of a track.
        stats_metric: Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics.
            Default is ["duration", "total_size"].
        pval_alternative: Alternative hypothesis for the p-value calculation. Can be "less" or "greater".
        finite_correction: Correct p-values for finite sampling. Default is True.
        n: Number of bootstraps.
        seed: Seed for the random number generator.
        allow_duplicates: If False, resampling will check if the resampled data contains duplicates.
            If True, duplicates will be allowed.
        max_tries: Maximum number of tries to resample data without duplicates.
        show_progress: Show a progress bar.
        verbose: Print additional information.
        parallel_processing: Use parallel processing.
        plot: Plot the distribution of the bootstrapped data.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - id_column: Deprecated. Use obj_id_column instead.
            - meas_column: Deprecated. Use measurement_column instead.
            - smoothK: Deprecated. Use smooth_k instead.
            - biasK: Deprecated. Use bias_k instead.
            - peakThr: Deprecated. Use peak_threshold instead.
            - binThr: Deprecated. Use binarization_threshold instead.
            - polyDeg: Deprecated. Use polynomial_degree instead.
            - biasMet: Deprecated. Use bias_method instead.
            - epsPrev: Deprecated. Use eps_prev instead.
            - minClsz: Deprecated. Use min_clustersize instead.
            - min_size: Deprecated. Use min_total_size instead.
            - paralell_processing: Deprecated. Use parallel_processing instead.

    Returns:
        DataFrame containing the bootstrapped data.
    """
    map_deprecated_params = {
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "min_size": "min_total_size",
        "paralell_processing": "parallel_processing",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)
    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)

    if not isinstance(stats_metric, list):
        stats_metric = [stats_metric]

    for stats_m in stats_metric:
        if stats_m not in [
            "duration",
            "total_size",
            "min_size",
            "max_size",
        ]:
            raise ValueError(f"Invalid metric: {stats_metric}")

    if pval_alternative not in ["less", "greater"]:
        raise ValueError(f"Invalid alternative hypothesis: {pval_alternative}")

    clid_name = 'clid'

    if isinstance(method, str):
        print(f'Resampling data using method "{method}"...')
    elif isinstance(method, list):
        print(f'Resampling data using methods "{method}"...')

    df_resampled = resample_data(
        data=df,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=obj_id_column,
        measurement_column=measurement_column,
        method=method,
        n=n,
        seed=seed,
        allow_duplicates=allow_duplicates,
        max_tries=max_tries,
        show_progress=show_progress,
        verbose=verbose,
        parallel_processing=parallel_processing,
    )

    iterations = df_resampled['iteration'].unique()

    print(f'Running ARCOS and calculating "{stats_metric}"...')

    stats_df, stats_df_mean = calculate_arcos_stats(
        df_resampled=df_resampled,
        position_columns=position_columns,
        frame_column=frame_column,
        obj_id_column=obj_id_column,
        measurement_column=measurement_column,
        smooth_k=smooth_k,
        bias_k=bias_k,
        peak_threshold=peak_threshold,
        binarization_threshold=binarization_threshold,
        polynomial_degree=polynomial_degree,
        bias_method=bias_method,
        eps=eps,
        eps_prev=eps_prev,
        min_clustersize=min_clustersize,
        n_prev=n_prev,
        min_duration=min_duration,
        min_total_size=min_total_size,
        stats_metric=stats_metric,
        show_progress=show_progress,
        parallel_processing=parallel_processing,
        clid_column=clid_name,
        iterations=iterations,
    )
    df_p = calculate_pvalue(stats_df_mean, stats_metric, pval_alternative, finite_correction, plot)
    return stats_df, df_p
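
A minimal sketch of a bootstrap run on toy data. The tiny n, allow_duplicates=True, and bias_method='none' are chosen purely so the example runs quickly on this small set; real analyses need many more iterations:

import numpy as np
import pandas as pd

from arcos4py.validation import bootstrap_arcos

rng = np.random.default_rng(0)
n_obj, n_frames = 5, 20
df = pd.DataFrame({
    'frame': np.tile(np.arange(n_frames), n_obj),
    'x': np.repeat(np.arange(n_obj, dtype=float), n_frames),
    'obj_id': np.repeat(np.arange(n_obj), n_frames),
    'm': rng.random(n_obj * n_frames),
})

stats_df, df_p = bootstrap_arcos(
    df,
    position_columns=['x'],
    frame_column='frame',
    obj_id_column='obj_id',
    measurement_column='m',
    bias_method='none',      # skip detrending on this short toy series
    method='shuffle_tracks',
    n=10,
    allow_duplicates=True,
    show_progress=False,
    plot=False,
)
print(df_p)  # one p-value per metric in stats_metric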

calculate_arcos_stats(df_resampled, iterations, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column='m', smooth_k=3, bias_k=51, peak_threshold=0.2, binarization_threshold=0.1, polynomial_degree=1, bias_method='runmed', eps=2, eps_prev=None, min_clustersize=1, n_prev=1, min_duration=1, min_total_size=1, stats_metric=['duration', 'total_size'], show_progress=True, parallel_processing=True, clid_column='clid', **kwargs)

Calculate the bootstrapped statistics.

Parameters:

Name Type Description Default
df_resampled DataFrame

Dataframe with resampled data.

required
iterations list[int]

List of iteration names, or range.

required
position_columns list

List of position columns.

['x']
frame_column str

Name of the frame column.

'frame'
obj_id_column str

Name of the id column.

'obj_id'
measurement_column str

Name of the measurement column.

'm'
smooth_k int

Smoothing kernel size for local detrending. Defaults to 3.

3
bias_k int

Bias kernel size for large scale detrending (used with bias_method='runmed'). Defaults to 51.

51
peak_threshold float

Peak threshold used for rescaling (used with bias_method='runmed'). Defaults to 0.2.

0.2
binarization_threshold float

Threshold for binarizing measurements after detrending. Defaults to 0.1.

0.1
polynomial_degree int

Polynomial degree used for detrending (used with bias_method='lm'). Defaults to 1.

1
bias_method str

Bias method, can be 'none', 'runmed', 'lm'. Defaults to "runmed".

'runmed'
eps float

Epsilon used for clustering active entities. Defaults to 2.

2
eps_prev int

Epsilon used for linking clusters across time. Defaults to None.

None
min_clustersize int

Minimum cluster size. Defaults to 1.

1
n_prev int

Number of previous frames to consider when tracking clusters. Defaults to 1.

1
min_duration int

Minimum duration of detected event. Defaults to 1.

1
min_total_size int

Minimum total size of a detected event. Defaults to 1.

1
stats_metric list[str]

List of metrics to calculate. Defaults to ['duration', 'total_size'].

['duration', 'total_size']
show_progress bool

Show progress bar. Defaults to True.

True
parallel_processing bool

Use parallel processing (via the joblib package). Defaults to True.

True
clid_column str

Name of the cluster id column. Defaults to 'clid'.

'clid'
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - posCols: Deprecated. Use position_columns instead. - id_column: Deprecated. Use obj_id_column instead. - meas_column: Deprecated. Use measurement_column instead. - smoothK: Deprecated. Use smooth_k instead. - biasK: Deprecated. Use bias_k instead. - peakThr: Deprecated. Use peak_threshold instead. - binThr: Deprecated. Use binarization_threshold instead. - polyDeg: Deprecated. Use polynomial_degree instead. - biasMet: Deprecated. Use bias_method instead. - epsPrev: Deprecated. Use eps_prev instead. - minClsz: Deprecated. Use min_clustersize instead. - min_size: Deprecated. Use min_total_size instead. - nPrev: Deprecated. Use n_prev instead. - paralell_processing: Deprecated. Use parallel_processing instead.

{}

Returns:

Name Type Description
DataFrame DataFrame

Dataframe with the bootstrapped statistics.

DataFrame DataFrame

Dataframe with mean statistics.

Source code in arcos4py/validation/_bootstrapping.py
def calculate_arcos_stats(
    df_resampled: pd.DataFrame,
    iterations: list[int],
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: str = 'm',
    smooth_k: int = 3,
    bias_k: int = 51,
    peak_threshold: float = 0.2,
    binarization_threshold: float = 0.1,
    polynomial_degree: int = 1,
    bias_method: str = "runmed",
    eps: float = 2,
    eps_prev: int | None = None,
    min_clustersize: int = 1,
    n_prev: int = 1,
    min_duration: int = 1,
    min_total_size: int = 1,
    stats_metric: list[str] = ['duration', 'total_size'],
    show_progress: bool = True,
    parallel_processing: bool = True,
    clid_column: str = 'clid',
    **kwargs,
):
    """Calculate the bootstrapped statistics.

    Arguments:
        df_resampled (DataFrame): Dataframe with resampled data.
        iterations (list[int]): List of iteration names, or range.
        position_columns (list): List of position columns.
        frame_column (str): Name of the frame column.
        obj_id_column (str): Name of the id column.
        measurement_column (str): Name of the measurement column.
        smooth_k (int, optional): Smoothing kernel size for local detrending. Defaults to 3.
        bias_k (int, optional): Bias kernel size for large scale detrending (used with bias_method='runmed'). Defaults to 51.
        peak_threshold (float, optional): Peak threshold used for rescaling (used with bias_method='runmed'). Defaults to 0.2.
        binarization_threshold (float, optional): Threshold for binarizing measurements after detrending. Defaults to 0.1.
        polynomial_degree (int, optional): Polynomial degree used for detrending (used with bias_method='lm'). Defaults to 1.
        bias_method (str, optional): Bias method, can be 'none', 'runmed', 'lm'. Defaults to "runmed".
        eps (float, optional): Epsilon used for clustering active entities. Defaults to 2.
        eps_prev (int, optional): Epsilon used for linking clusters across time. Defaults to None.
        min_clustersize (int, optional): Minimum cluster size. Defaults to 1.
        n_prev (int, optional): Number of previous frames to consider when tracking clusters. Defaults to 1.
        min_duration (int, optional): Minimum duration of detected event. Defaults to 1.
        min_total_size (int, optional): Minimum total size of a detected event. Defaults to 1.
        stats_metric (list[str], optional): List of metrics to calculate. Defaults to ['duration', 'total_size'].
        show_progress (bool, optional): Show progress bar. Defaults to True.
        parallel_processing (bool, optional): Use parallel processing (via the joblib package). Defaults to True.
        clid_column (str, optional): Name of the cluster id column. Defaults to 'clid'.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCols: Deprecated. Use position_columns instead.
            - id_column: Deprecated. Use obj_id_column instead.
            - meas_column: Deprecated. Use measurement_column instead.
            - smoothK: Deprecated. Use smooth_k instead.
            - biasK: Deprecated. Use bias_k instead.
            - peakThr: Deprecated. Use peak_threshold instead.
            - binThr: Deprecated. Use binarization_threshold instead.
            - polyDeg: Deprecated. Use polynomial_degree instead.
            - biasMet: Deprecated. Use bias_method instead.
            - epsPrev: Deprecated. Use eps_prev instead.
            - minClsz: Deprecated. Use min_clustersize instead.
            - min_size: Deprecated. Use min_total_size instead.
            - nPrev: Deprecated. Use n_prev instead.
            - paralell_processing: Deprecated. Use parallel_processing instead.
            - clid_name: Deprecated. Use clid_column instead.

    Returns:
        DataFrame (pd.DataFrame): DataFrame with the bootstrapped statistics.
        DataFrame (pd.DataFrame): DataFrame with mean statistics.
    """
    map_deprecated_params = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "smoothK": "smooth_k",
        "biasK": "bias_k",
        "peakThr": "peak_threshold",
        "binThr": "binarization_threshold",
        "polyDeg": "polynomial_degree",
        "biasMet": "bias_method",
        "epsPrev": "eps_prev",
        "minClsz": "min_clustersize",
        "nPrev": "n_prev",
        "min_size": "min_total_size",
        "paralell_processing": "parallel_processing",
        "clid_name": "clid_column",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")

    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    # Assigning the parameters
    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    smooth_k = updated_kwargs.get("smooth_k", smooth_k)
    bias_k = updated_kwargs.get("bias_k", bias_k)
    peak_threshold = updated_kwargs.get("peak_threshold", peak_threshold)
    binarization_threshold = updated_kwargs.get("binarization_threshold", binarization_threshold)
    polynomial_degree = updated_kwargs.get("polynomial_degree", polynomial_degree)
    bias_method = updated_kwargs.get("bias_method", bias_method)
    min_total_size = updated_kwargs.get("min_total_size", min_total_size)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)
    clid_column = updated_kwargs.get("clid_column", clid_column)
    min_clustersize = updated_kwargs.get("min_clustersize", min_clustersize)
    eps_prev = updated_kwargs.get("eps_prev", eps_prev)
    n_prev = updated_kwargs.get("n_prev", n_prev)

    if parallel_processing:
        from joblib import Parallel, delayed

        stats_df_list = Parallel(n_jobs=-1)(
            delayed(_apply_arcos)(
                i_iter=i_iter,
                df_resampled=df_resampled,
                position_columns=position_columns,
                frame_column=frame_column,
                obj_id_column=obj_id_column,
                measurement_column=measurement_column,
                smooth_k=smooth_k,
                bias_k=bias_k,
                peak_threshold=peak_threshold,
                binarization_threshold=binarization_threshold,
                polynomial_degree=polynomial_degree,
                bias_method=bias_method,
                eps=eps,
                eps_prev=eps_prev,
                min_clustersize=min_clustersize,
                n_prev=n_prev,
                min_duration=min_duration,
                min_total_size=min_total_size,
                clid_column=clid_column,
            )
            for i_iter in tqdm(iterations, disable=not show_progress)
        )
    else:
        stats_df_list = []
        for i_iter in tqdm(iterations, disable=not show_progress):
            stats_df = _apply_arcos(
                i_iter=i_iter,
                df_resampled=df_resampled,
                position_columns=position_columns,
                frame_column=frame_column,
                obj_id_column=obj_id_column,
                measurement_column=measurement_column,
                smooth_k=smooth_k,
                bias_k=bias_k,
                peak_threshold=peak_threshold,
                binarization_threshold=binarization_threshold,
                polynomial_degree=polynomial_degree,
                bias_method=bias_method,
                eps=eps,
                eps_prev=eps_prev,
                min_clustersize=min_clustersize,
                n_prev=n_prev,
                min_duration=min_duration,
                min_total_size=min_total_size,
                clid_column=clid_column,
            )
            stats_df_list.append(stats_df)

    stats_df = pd.concat(stats_df_list, ignore_index=True)

    stats_df_indexer = ['bootstrap_iteration'] + stats_metric
    stats_df_mean: pd.DataFrame = (
        stats_df[stats_df_indexer].groupby(['bootstrap_iteration']).agg(['mean']).reset_index()
    )
    stats_df_mean = stats_df_mean.droplevel(level=1, axis=1)
    # for bootstrap iterations that did not detect any events, set the metric to 0
    stats_df_mean[stats_metric] = stats_df_mean[stats_metric].fillna(0)
    return stats_df, stats_df_mean
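
A minimal end-to-end sketch of the bootstrapping workflow shown above (the input file, column names, and parameter values are assumptions for illustration, and the imports assume both helpers are exposed under arcos4py.validation):

import pandas as pd
from arcos4py.validation import calculate_arcos_stats, resample_data

# hypothetical tracked time series with columns 'frame', 'obj_id', 'x', 'y' and 'm'
data = pd.read_csv('tracks.csv')

# iteration 0 of the resampled output keeps the original data; 1..n are resamples
df_resampled = resample_data(
    data,
    position_columns=['x', 'y'],
    frame_column='frame',
    obj_id_column='obj_id',
    method='shuffle_tracks',
    n=100,
)

stats_df, stats_df_mean = calculate_arcos_stats(
    df_resampled,
    iterations=range(101),  # one entry per iteration, including iteration 0
    position_columns=['x', 'y'],
    frame_column='frame',
    obj_id_column='obj_id',
    measurement_column='m',
    eps=2,
    min_clustersize=3,
)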

calculate_pvalue(stats_df_mean, stats_metric, pval_alternative, finite_correction, plot, **plot_kwargs)

Calculates the p-value with the given alternative hypothesis.

Parameters:

Name Type Description Default
stats_df_mean DataFrame

DataFrame containing the bootstrapped data.

required
stats_metric str | list[str]

Metric to calculate. Can be "duration", "total_size", "min_size", "max_size" or a list of metrics. Default is ["duration", "total_size"].

required
pval_alternative str

Alternative hypothesis for the p-value calculation. Can be "less", "greater" or "both", which will return p-values for both alternatives.

required
finite_correction bool

Correct p-values for finite sampling. Default is True.

required
plot bool

Plot the distribution of the bootstrapped data.

required

Returns:

Name Type Description
DataFrame DataFrame

DataFrame containing the p-values.

Source code in arcos4py/validation/_bootstrapping.py
def calculate_pvalue(
    stats_df_mean: pd.DataFrame,
    stats_metric: str | list[str],
    pval_alternative: str,
    finite_correction: bool,
    plot: bool,
    **plot_kwargs,
):
    """Calculates the p-value with the given alternative hypothesis.

    Arguments:
        stats_df_mean (DataFrame): DataFrame containing the bootstrapped data.
        stats_metric (str | list[str]): Metric to calculate.
            Can be "duration", "total_size", "min_size", "max_size" or a list of metrics.
            Default is ["duration", "total_size"].
        pval_alternative (str): Alternative hypothesis for the p-value calculation.
            Can be "less", "greater" or "both", which will return p-values for both alternatives.
        finite_correction (bool): Correct p-values for finite sampling. Default is True.
        plot (bool): Plot the distribution of the bootstrapped data.
        **plot_kwargs (Any): Additional keyword arguments passed to the underlying histogram plot.

    Returns:
        DataFrame (pd.DataFrame): DataFrame containing the p-values. If plot is True,
            the figure and axes are returned as well.
    """
    if finite_correction:
        pval = stats_df_mean[stats_metric].agg(lambda x: _p_val_finite_sampling(x, pval_alternative))
    else:
        pval = stats_df_mean[stats_metric].agg(lambda x: _p_val_infinite_sampling(x, pval_alternative))
    pval.name = 'p_value'

    if isinstance(stats_metric, list):
        _stats_metric = stats_metric
    else:
        _stats_metric = [stats_metric]

    # bootstrap iteration 0 holds the statistics of the original (non-resampled) data
    mean_control = stats_df_mean[stats_metric].iloc[0]
    stats_df_mean = stats_df_mean[stats_df_mean['bootstrap_iteration'] != 0].reset_index(drop=True)

    if plot:
        fig, axis = plt.subplots(1, len(_stats_metric))
        try:
            iter(axis)
        except TypeError:
            axis = [axis]
        for idx, (ax, stats_col) in enumerate(zip(axis, _stats_metric)):
            sns.histplot(stats_df_mean[stats_col], ax=ax, kde=True, stat='density', common_norm=False, **plot_kwargs)
            ax.set_title(stats_col)
            ax.vlines(mean_control[stats_col], ymin=0, ymax=ax.get_ylim()[1], color='red', ls='--')
            ax.set_xlabel('Value')
            if len(axis) > 1 and idx == 0:
                ax.set_ylabel('Density')
            else:
                ax.set_ylabel('')
            x_pos = ax.get_xlim()[0] + ((ax.get_xlim()[1] - ax.get_xlim()[0]) * 0.7)
            y_pos = ax.get_ylim()[0] + ((ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.7)
            ax.text(
                x_pos,
                y_pos,
                f'p-value\n{pval[stats_col].values[0]:.3f}',
                ha='center',
                va='center',
                color='red',
            )
        fig.suptitle(f'Bootstrapped metrics: pval_alternative {pval.index[0]}')
        return pval, fig, axis
    return pval
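
A short usage sketch, assuming stats_df_mean comes from calculate_arcos_stats and that bootstrap iteration 0 holds the statistics of the original data:

pval = calculate_pvalue(
    stats_df_mean,
    stats_metric=['duration', 'total_size'],
    pval_alternative='greater',  # one-sided: is the original value larger than the null?
    finite_correction=True,
    plot=False,
)

# with plot=True the figure and axes are returned as well, and extra keyword
# arguments (e.g. bins) are forwarded to the underlying histogram plot
pval, fig, axes = calculate_pvalue(
    stats_df_mean, ['duration', 'total_size'], 'greater', True, True, bins=30
)

The exact formula used by the private _p_val_finite_sampling helper is not shown here; a common finite-sampling correction, which this implementation may or may not match, is p = (k + 1) / (B + 1), where k counts bootstrap means at least as extreme as the observed value and B is the number of bootstrap iterations.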

resample_data(data, position_columns=['x'], frame_column='frame', obj_id_column='obj_id', measurement_column=None, method='shuffle_tracks', n=100, seed=42, allow_duplicates=False, max_tries=100, show_progress=True, verbose=False, parallel_processing=True, **kwargs)

Resamples data in order to perform bootstrapping analysis.

Parameters:

Name Type Description Default
data DataFrame

The data to resample.

required
position_columns list

The columns to use for the position.

['x']
frame_column str

The column to use for the frame.

'frame'
obj_id_column str

The column to use for the object ID.

'obj_id'
measurement_column str

The column to use for the measurement. Only needed for 'shuffle_binary_blocks'. Defaults to None.

None
method str | list[str]

The method(s) to use for resampling. Defaults to 'shuffle_tracks'. Available methods are: 'shuffle_tracks', 'shuffle_timepoints', 'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint'.

'shuffle_tracks'
n int

The number of resample iterations. Defaults to 100.

100
seed int

The random seed. Defaults to 42.

42
allow_duplicates bool

Whether to allow resampling to randomly generate the same data twice. Defaults to False.

False
max_tries int

The maximum number of attempts to generate unique data when allow_duplicates is set to False. Defaults to 100.

100
show_progress bool

Whether to show a progress bar. Defaults to True.

True
verbose bool

Whether to print progress. Defaults to False.

False
parallel_processing bool

Whether to use parallel processing. Defaults to True.

True
**kwargs Any

Additional keyword arguments. Includes deprecated parameters. - posCols (list): Deprecated. Use position_columns instead. - id_column (str): Deprecated. Use obj_id_column instead. - meas_column (str): Deprecated. Use measurement_column instead. - paralell_processing (bool): Deprecated. Use parallel_processing instead.

{}

Returns:

Type Description
DataFrame

pd.DataFrame: The resampled data.

Source code in arcos4py/validation/_resampling.py
def resample_data(  # noqa: C901
    data: pd.DataFrame,
    position_columns: list = ['x'],
    frame_column: str = 'frame',
    obj_id_column: str = 'obj_id',
    measurement_column: Union[str, None] = None,
    method: Union[str, list[str]] = 'shuffle_tracks',
    n=100,
    seed=42,
    allow_duplicates=False,
    max_tries=100,
    show_progress=True,
    verbose=False,
    parallel_processing=True,
    **kwargs,
) -> pd.DataFrame:
    """Resamples data in order to perform bootstrapping analysis.

    Arguments:
        data (pd.DataFrame): The data to resample.
        position_columns (list): The columns to use for the position.
        frame_column (str): The column to use for the frame.
        obj_id_column (str): The column to use for the object ID.
        measurement_column (str, optional): The column to use for the measurement.
            Only needed for 'shuffle_binary_blocks'. Defaults to None.
        method (str | list[str], optional): The method(s) to use for resampling. Defaults to 'shuffle_tracks'.
            Available methods are: 'shuffle_tracks', 'shuffle_timepoints',
            'shift_timepoints', 'shuffle_binary_blocks', 'shuffle_coordinates_timepoint'.
        n (int, optional): The number of resample iterations. Defaults to 100.
        seed (int, optional): The random seed. Defaults to 42.
        allow_duplicates (bool, optional): Whether to allow resampling to randomly generate the same data twice.
            Defaults to False.
        max_tries (int, optional): The maximum number of attempts to generate unique data
            when allow_duplicates is set to False. Defaults to 100.
        show_progress (bool, optional): Whether to show a progress bar. Defaults to True.
        verbose (bool, optional): Whether to print progress. Defaults to False.
        parallel_processing (bool, optional): Whether to use parallel processing. Defaults to True.
        **kwargs (Any): Additional keyword arguments. Includes deprecated parameters.
            - posCols (list): Deprecated. Use position_columns instead.
            - id_column (str): Deprecated. Use obj_id_column instead.
            - meas_column (str): Deprecated. Use measurement_column instead.
            - paralell_processing (bool): Deprecated. Use parallel_processing instead.

    Returns:
        pd.DataFrame: The resampled data.
    """
    map_deprecated_params = {
        "posCols": "position_columns",
        "id_column": "obj_id_column",
        "meas_column": "measurement_column",
        "paralell_processing": "parallel_processing",
    }

    # check allowed kwargs
    allowed_kwargs = map_deprecated_params.keys()
    for key in kwargs:
        if key not in allowed_kwargs:
            raise ValueError(f"Got an unexpected keyword argument '{key}'")
    updated_kwargs = handle_deprecated_params(map_deprecated_params, **kwargs)

    position_columns = updated_kwargs.get("position_columns", position_columns)
    obj_id_column = updated_kwargs.get("obj_id_column", obj_id_column)
    measurement_column = updated_kwargs.get("measurement_column", measurement_column)
    parallel_processing = updated_kwargs.get("parallel_processing", parallel_processing)

    # validate the input
    if not isinstance(data, pd.DataFrame):
        raise TypeError('data must be a pandas.DataFrame')
    if not isinstance(position_columns, list):
        raise TypeError('position_columns must be a list')
    if not isinstance(frame_column, str):
        raise TypeError('frame_column must be a string')
    if not isinstance(obj_id_column, str):
        raise TypeError('obj_id_column must be a string')
    if not isinstance(measurement_column, str) and measurement_column is not None:
        raise TypeError('measurement_column must be a string or None')
    if not isinstance(method, str) and not isinstance(method, list):
        raise TypeError('method must be a string or list')
    if not isinstance(n, int):
        raise TypeError('n must be an integer')
    if not isinstance(seed, int):
        raise TypeError('seed must be an integer')
    if not isinstance(verbose, bool):
        raise TypeError('verbose must be a boolean')
    if not isinstance(parallel_processing, bool):
        raise TypeError('parallel_processing must be a boolean')

    if len(position_columns) < 1:
        raise ValueError('position_columns must contain at least one column')
    if n < 1:
        raise ValueError('n must be a positive integer')
    if seed < 0:
        raise ValueError('seed must be a non-negative integer')

    method_dict: dict[str, Callable] = {
        'shuffle_tracks': shuffle_tracks,
        'shuffle_timepoints': shuffle_timepoints,
        'shift_timepoints': shift_timepoints_per_trajectory,
        'shuffle_binary_blocks': shuffle_activity_bocks_per_trajectory,
        'shuffle_coordinates_timepoint': shuffle_coordinates_per_timepoint,
    }

    function_args: dict[str, tuple] = {
        'shuffle_tracks': (obj_id_column, position_columns, frame_column),
        'shuffle_timepoints': (obj_id_column, frame_column),
        'shift_timepoints': (obj_id_column, frame_column),
        'shuffle_binary_blocks': (obj_id_column, frame_column, measurement_column),
        'shuffle_coordinates_timepoint': (position_columns, frame_column),
    }

    resampling_func_list = []

    # convert method to list if necessary
    if isinstance(method, str):
        methods = [method]
    else:
        methods = method

    # Check if the method is valid
    for method in methods:
        if method not in method_dict.keys():
            raise ValueError(f'method must be one of {method_dict.keys()}')
        if method == 'shuffle_binary_blocks' and measurement_column is None:
            raise ValueError('measurement_column must be set for shuffle_binary_blocks')

    # Check if the columns are in the data
    if 'shuffle_binary_blocks' in methods:
        relevant_columns = position_columns + [frame_column, obj_id_column, measurement_column]
    else:
        relevant_columns = position_columns + [frame_column, obj_id_column]

    for i in relevant_columns:
        if i not in data.columns:
            raise ValueError(f'{i} not in df.columns')

    # check if there are any NaN values in the selected columns
    na_cols = []
    for i in relevant_columns:
        if data[i].isnull().values.any():
            na_cols.append(i)
    if na_cols:
        warnings.warn(f'NaN values in {na_cols}, default behaviour is to drop these rows')
        data.dropna(subset=na_cols, inplace=True)

    # Sort the data
    data.sort_values([obj_id_column, frame_column], inplace=True)

    rng = np.random.default_rng(seed)
    # draw one random seed (between 0 and 1,000,000,000) per resampling iteration
    seed_list = rng.integers(1_000_000_000, size=n)
    df_out: list[pd.DataFrame] = []
    # resample the data n times using the selected method(s)
    if verbose:
        print(f'Resampling for each object {n} times')

    # create a list of functions to call
    for method in methods:
        resampling_func_list.append(method_dict[method])
    iter_range = range(1, n + 1)
    if parallel_processing:
        from joblib import Parallel, delayed

        # iterate over the number of resamples
        df_out = Parallel(n_jobs=-1)(
            delayed(_apply_resampling)(
                iter_number=i,
                data=data,
                methods=methods,
                resampling_func_list=resampling_func_list,
                seed_list=seed_list,
                function_args=function_args,
            )
            for i in tqdm(iter_range, disable=not show_progress)
        )

    else:
        # iterate over the number of resamples
        for i in tqdm(iter_range, disable=not show_progress):
            data_new = _apply_resampling(
                iter_number=i,
                data=data,
                methods=methods,
                resampling_func_list=resampling_func_list,
                seed_list=seed_list,
                function_args=function_args,
            )
            if not allow_duplicates:
                current_try = 0
                # make sure that data_new is not already in df_out
                # (comparing all columns except 'iteration'); otherwise redo the resampling
                while any(
                    data_new.loc[:, data_new.columns != 'iteration'].equals(i.loc[:, i.columns != 'iteration'])
                    for i in df_out
                ):
                    current_try += 1
                    data_new = _apply_resampling(
                        iter_number=i,
                        data=data,
                        methods=methods,
                        resampling_func_list=resampling_func_list,
                        seed_list=seed_list,
                        function_args=function_args,
                    )
                    if current_try > max_tries:
                        raise ValueError(
                            f'Could not find a unique resampling after {max_tries} tries; '
                            'try increasing max_tries or setting allow_duplicates=True'
                        )

            df_out.append(data_new)

    data_it0 = data.copy()
    data_it0['iteration'] = np.repeat(0, len(data_it0))
    df_out.insert(0, data_it0)
    return pd.concat(df_out)[data.columns.tolist() + ['iteration']]
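
As a usage sketch (column names are assumptions), several resampling methods can be chained in one call by passing a list; the output stacks iteration 0, the unmodified input, plus the n resampled copies, distinguishable via the added 'iteration' column:

df_resampled = resample_data(
    data,
    position_columns=['x', 'y'],
    frame_column='frame',
    obj_id_column='obj_id',
    method=['shuffle_timepoints', 'shuffle_coordinates_timepoint'],
    n=50,
    seed=7,
)
print(sorted(df_resampled['iteration'].unique()))  # [0, 1, ..., 50]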