Browse Source

added functions for selecting from dataframes and splitting a dataframe

main
Holger Frey 2 years ago
parent
commit
331f41ef61
  1. 3
      pyproject.toml
  2. 4
      src/sensospot_tools/__init__.py
  3. 29
      src/sensospot_tools/helpers.py
  4. 74
      src/sensospot_tools/selection.py
  5. 18
      tests/test_helpers.py
  6. 39
      tests/test_selection.py
  7. 45
      tests/test_sensospot_tools.py

3
pyproject.toml

@ -27,8 +27,9 @@ classifiers = [ @@ -27,8 +27,9 @@ classifiers = [
]
dependencies = [
"pandas"
]
[project.urls]
Source = "https://git.cpi.imtek.uni-freiburg.de/holgi/sensospot_tools.git"

4
src/sensospot_tools/__init__.py

@ -5,6 +5,4 @@ Some small tools for working with parsed Sensospot data. @@ -5,6 +5,4 @@ Some small tools for working with parsed Sensospot data.
__version__ = "0.0.1"
def test():
    """Print a short message showing that the package can be called."""
    message = "works"
    print(message)
from .selection import split, select # noqa: F401

29
src/sensospot_tools/helpers.py

@ -0,0 +1,29 @@ @@ -0,0 +1,29 @@
from typing import Any
def ensure_list(something: Any) -> list[Any]:
    """Return the provided value as a list, wrapping it if necessary.

    This is intended for places where column names should be provided as a
    list, but a single column name should be accepted as well.

    >>> ensure_list("abc")
    ['abc']
    >>> ensure_list(("a", "b"))
    ['a', 'b']
    >>> ensure_list({"a": 1, "b": 2})
    ['a', 'b']
    >>> ensure_list(1)
    [1]

    something: the value to convert to a list or to wrap in a list
    returns: a list with the items of ``something`` if it is an iterable,
        otherwise a one-element list containing ``something`` itself;
        ``str`` and ``bytes`` values are always treated as a single value
    """
    # str and bytes are iterables, but they represent one single value and
    # must not be split into characters or integers
    if isinstance(something, (str, bytes)):
        return [something]
    try:
        # only the check for iterability is guarded: a TypeError raised later
        # while consuming the items must not be mistaken for "not iterable"
        items = iter(something)
    except TypeError:
        # something is not an iterable
        return [something]
    return list(items)

74
src/sensospot_tools/selection.py

@ -0,0 +1,74 @@ @@ -0,0 +1,74 @@
from typing import Any, Iterator
import pandas
def select(
    data: pandas.DataFrame, column: str, value: Any
) -> pandas.DataFrame:
    """selects a portion of a dataframe based on a value in a column

    Example:

    >>> print(data)
      category  value
    0      dog      1
    1      cat      2
    2    horse      3
    3      cat      4

    >>> print(select(data, "category", "cat"))
      category  value
    1      cat      2
    3      cat      4

    A scalar missing value (``None``, ``NaN``, ``NaT``, ``pandas.NA``)
    selects the rows where the column has a missing value.

    data: a DataFrame to select from
    column: name of a column in the dataframe
    value: rows with this value in the column will be selected
    returns: a copy of the rows of the DataFrame that have the value in
        the column
    """
    column_values = data[column]
    if pandas.api.types.is_scalar(value) and pandas.isna(value):
        # a missing value never compares equal to anything (not even to
        # itself), so "==" would always produce an empty selection
        selector = column_values.isna()
    else:
        selector = column_values == value
    # copy, so that changes to the result do not touch the original data
    return data.loc[selector].copy()
def split(
    data: pandas.DataFrame, column: str
) -> Iterator[tuple[Any, pandas.DataFrame]]:
    """splits a data frame into parts, one for each unique value in a column

    The result is an iterator of key-value-pairs: the key is one of the
    unique values found in the column, the value is the part of the
    dataframe selected by this unique value.

    Example:

    >>> print(data)
      category  value
    0      dog      1
    1      cat      2
    2    horse      3
    3      cat      4

    >>> result = dict( split(data, column="category") )

    >>> print(result["dog"])
      category  value
    0      dog      1

    >>> print(result["cat"])
      category  value
    1      cat      2
    3      cat      4

    >>> print(result["horse"])
      category  value
    2    horse      3

    data: DataFrame to process
    column: column identifier to split on unique values
    yields: key-value-pairs of
        keys: one unique value
        values: part of the dataframe that contains the unique value
    """
    # the unique values are looked up right away, the selections themselves
    # are only made while iterating over the result
    keys = data[column].unique()
    return ((key, select(data, column, key)) for key in keys)

18
tests/test_helpers.py

@ -0,0 +1,18 @@ @@ -0,0 +1,18 @@
import pytest
@pytest.mark.parametrize(
    "provided, expected",
    [
        # a string is a single value and must not be split up
        ("abc", ["abc"]),
        # iterables are converted item by item
        (("a", "b", "c"), ["a", "b", "c"]),
        # iterating a dict provides its keys
        ({"a": 1, "b": 2}, ["a", "b"]),
        # a non-iterable value gets wrapped
        (1, [1]),
    ],
)
def test_helpers_ensure_list(provided, expected):
    """ensure_list() converts or wraps the provided value as expected"""
    from sensospot_tools.helpers import ensure_list

    assert ensure_list(provided) == expected

39
tests/test_selection.py

@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
import pytest
# Tab separated example table. The separators are written as "\t" escapes,
# so they stay real tabs even if an editor or a copy & paste step converts
# whitespace; the data is parsed with sep="\t".
CSV_DATA = """
category\tvalue
dog\t3
cat\t55
horse\t35
cat\t60
horse\t9
"""
@pytest.fixture
def example():
    """provides the table defined in CSV_DATA as a pandas DataFrame"""
    import io

    import pandas

    # surrounding blank lines of the text block are removed before parsing
    csv_text = CSV_DATA.strip()
    yield pandas.read_csv(io.StringIO(csv_text), sep="\t")
def test_selection_select(example):
    """select() only returns the rows with the requested value"""
    from sensospot_tools.selection import select

    horses = select(example, "category", "horse")

    assert horses["category"].tolist() == ["horse", "horse"]
    assert horses["value"].tolist() == [35, 9]
def test_selection_split(example):
    """split() provides one part of the data for each unique value"""
    from sensospot_tools.selection import split

    parts = dict(split(example, "category"))

    assert sorted(parts) == ["cat", "dog", "horse"]
    expected_values = {"cat": [55, 60], "dog": [3], "horse": [35, 9]}
    for key, values in expected_values.items():
        assert list(parts[key]["value"]) == values

45
tests/test_sensospot_tools.py

@ -1,41 +1,4 @@ @@ -1,41 +1,4 @@
""" Stub file for testing the project
There are three predefined ways to run tests:
make test:
runs only unit tests, that are not marked with "fun" (for functional test)
in a random order. If a test failed before, only the failed tests will be
run. This is intended to be the default testing method while developing.
make testall:
runs unit tests and functional tests in random order. Will give a complete
overview of the test suite.
make coverage:
runs only tests marked with "fun" (for functional tests) and generates a
coverage report for the test run. The idea is to check the test coverage
only on functional tests to see if a) everything is as much covered as
possible and b) to find dead code that is not called in end-to-end tests.
all three test strategies will run "make lint" before to catch easily made
mistakes.
"""
import pytest
def test_example_unittest():
    """example unit test - checks that the project can be imported

    will be run by 'make test' and 'make testall' but not 'make coverage'
    """
    # the import itself is the test, the name is not used
    import sensospot_tools  # noqa: F401
@pytest.mark.functional
def test_example_functional_test():
    """example functional test - a placeholder that always passes

    will be run by 'make coverage' and 'make testall' but not 'make test'
    """
    assert True
def test_api():
    """checks that the public functions are importable from the package"""
    # the imports themselves are the test, the names are not used
    from sensospot_tools import select, split  # noqa: F401

Loading…
Cancel
Save