From 8a7b91980bdd5921d46a45dbee091f1973099eda Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 10:51:43 +0200 Subject: [PATCH 01/13] added sum to trend data --- app/services/health/clickhouse.py | 3 ++- app/services/health/duckdb_queries.py | 7 ++++--- app/services/health/elasticsearch.py | 2 ++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/app/services/health/clickhouse.py b/app/services/health/clickhouse.py index d80d87b..b3803a1 100644 --- a/app/services/health/clickhouse.py +++ b/app/services/health/clickhouse.py @@ -33,7 +33,8 @@ def get_trend_data_from_ch( ) -> dict[str, Any]: return ch.inquire(f""" SELECT toStartOfInterval(startDate, INTERVAL 1 {interval}) AS interval, - AVG(value), MIN(value), MAX(value), COUNT(*) FROM {ch.db_name}.{ch.table_name} + AVG(value) AS average, SUM(value) AS sum, MIN(value) AS min, + MAX(value) AS max, COUNT(*) AS count FROM {ch.db_name}.{ch.table_name} WHERE type = '{record_type}' {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 6b90fae..7047106 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -45,11 +45,12 @@ def get_trend_data_from_duckdb( ) -> list[dict[str, Any]]: result = duckdb.sql(f""" SELECT time_bucket(INTERVAL '1 {interval}', startDate) AS interval, - AVG(value) AS average, MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count + AVG(value) AS average, SUM(value) AS sum, + MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count FROM read_parquet('{client.parquetpath}') WHERE type = '{record_type}' - {f"AND startDate >= '{date_from}'" if date_from else ""} - {f"AND startDate <= '{date_to}'" if date_to else ""} + {f"AND startDate >= '{date_from}'" if date_from else ""} + {f"AND startDate <= '{date_to}'" if date_to else ""} GROUP BY interval ORDER BY interval ASC """) return 
client.format_response(result) diff --git a/app/services/health/elasticsearch.py b/app/services/health/elasticsearch.py index 9f0d76a..dca5fb1 100644 --- a/app/services/health/elasticsearch.py +++ b/app/services/health/elasticsearch.py @@ -106,6 +106,7 @@ def get_trend_data_logic( "avg_value": {"avg": {"field": "value"}}, "min_value": {"min": {"field": "value"}}, "max_value": {"max": {"field": "value"}}, + "value_sum": {"sum": {"field": "value"}}, "count": {"value_count": {"field": "value"}}, }, }, @@ -121,6 +122,7 @@ def get_trend_data_logic( "avg_value": bucket["avg_value"]["value"], "min_value": bucket["min_value"]["value"], "max_value": bucket["max_value"]["value"], + "value_sum": bucket["value_sum"]["value"], "count": bucket["count"]["value"], }, ) From 7795bf7008dc7a11d4ee2e7b0160c6a7293cfe9e Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 11:16:14 +0200 Subject: [PATCH 02/13] added device grouping to duckdb for test --- app/services/health/duckdb_queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 7047106..9714628 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -51,7 +51,7 @@ def get_trend_data_from_duckdb( WHERE type = '{record_type}' {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} - GROUP BY interval ORDER BY interval ASC + GROUP BY interval, device ORDER BY interval ASC """) return client.format_response(result) From a70831cad29a2ef0b1341b5521502b10bda60ebb Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 11:23:50 +0200 Subject: [PATCH 03/13] added device as well to query --- app/services/health/clickhouse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/services/health/clickhouse.py b/app/services/health/clickhouse.py index b3803a1..e7b3497 100644 --- a/app/services/health/clickhouse.py 
+++ b/app/services/health/clickhouse.py @@ -32,7 +32,7 @@ def get_trend_data_from_ch( date_to: str | None = None, ) -> dict[str, Any]: return ch.inquire(f""" - SELECT toStartOfInterval(startDate, INTERVAL 1 {interval}) AS interval, + SELECT device, toStartOfInterval(startDate, INTERVAL 1 {interval}) AS interval, AVG(value) AS average, SUM(value) AS sum, MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count FROM {ch.db_name}.{ch.table_name} WHERE type = '{record_type}' From 74c5428cc181f3a077904f80dc7472619a8b95df Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 11:33:09 +0200 Subject: [PATCH 04/13] ch and duck device/interval grouping --- app/services/health/clickhouse.py | 5 ++++- app/services/health/duckdb_queries.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/app/services/health/clickhouse.py b/app/services/health/clickhouse.py index e7b3497..7cf09f5 100644 --- a/app/services/health/clickhouse.py +++ b/app/services/health/clickhouse.py @@ -38,7 +38,7 @@ def get_trend_data_from_ch( WHERE type = '{record_type}' {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} - GROUP BY interval ORDER BY interval ASC + GROUP BY interval, device ORDER BY interval ASC """) @@ -54,3 +54,6 @@ def search_values_from_ch( {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} """) + +if __name__ == "__main__": + print(get_trend_data_from_ch("HKQuantityTypeIdentifierStepCount", "week", "2023-03-01", "2023-04-01")) \ No newline at end of file diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 9714628..d4b0156 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -44,7 +44,7 @@ def get_trend_data_from_duckdb( date_to: str | None = None, ) -> list[dict[str, Any]]: result = duckdb.sql(f""" - SELECT time_bucket(INTERVAL '1 {interval}', startDate) AS 
interval, + SELECT device, time_bucket(INTERVAL '1 {interval}', startDate) AS interval, AVG(value) AS average, SUM(value) AS sum, MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count FROM read_parquet('{client.parquetpath}') @@ -69,3 +69,6 @@ def search_values_from_duckdb( {f"AND startDate <= '{date_to}'" if date_to else ""} """) return client.format_response(result) + +if __name__ == "__main__": + print(get_trend_data_from_duckdb("HKQuantityTypeIdentifierStepCount", "week", "2023-03-01", "2023-04-01")) \ No newline at end of file From 954654c2adca352fff2b3526db59b7164b43dfe5 Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 11:53:32 +0200 Subject: [PATCH 05/13] docstring tweak --- app/mcp/v1/tools/ch_reader.py | 5 +++++ app/mcp/v1/tools/duckdb_reader.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/app/mcp/v1/tools/ch_reader.py b/app/mcp/v1/tools/ch_reader.py index 6ccf2de..09788a8 100644 --- a/app/mcp/v1/tools/ch_reader.py +++ b/app/mcp/v1/tools/ch_reader.py @@ -142,9 +142,11 @@ def get_trend_data_ch( Returns: - record_type: The analyzed record type + - device: The device on which the data was recorded - interval: The time interval used - trend_data: List of time buckets with statistics for each period: * date: The time period (ISO string) + * value_sum: Sum of values for the period * avg_value: Average value for the period * min_value: Minimum value for the period * max_value: Maximum value for the period @@ -152,6 +154,9 @@ def get_trend_data_ch( Notes for LLMs: - Use this to analyze trends, patterns, and seasonal variations in health data + - Keep in mind that when there is data from multiple devices spanning the same + time period, there is a possibility of data being duplicated. Inform the user + of this possibility if you see multiple devices in the same time period. - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". 
Do not use other values. diff --git a/app/mcp/v1/tools/duckdb_reader.py b/app/mcp/v1/tools/duckdb_reader.py index f142ea9..e34bb5b 100644 --- a/app/mcp/v1/tools/duckdb_reader.py +++ b/app/mcp/v1/tools/duckdb_reader.py @@ -142,9 +142,11 @@ def get_trend_data_duckdb( Returns: - record_type: The analyzed record type + - device: The device on which the data was recorded - interval: The time interval used - trend_data: List of time buckets with statistics for each period: * date: The time period (ISO string) + * value_sum: Sum of values for the period * avg_value: Average value for the period * min_value: Minimum value for the period * max_value: Maximum value for the period @@ -152,6 +154,9 @@ def get_trend_data_duckdb( Notes for LLMs: - Use this to analyze trends, patterns, and seasonal variations in health data + - Keep in mind that when there is data from multiple devices spanning the same + time period, there is a possibility of data being duplicated. Inform the user + of this possibility if you see multiple devices in the same time period. - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. From 1afd7fea514020d2aea9bc9cf80572e5a512414d Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 14:58:55 +0200 Subject: [PATCH 06/13] docstring improving --- app/mcp/v1/tools/ch_reader.py | 2 ++ app/mcp/v1/tools/duckdb_reader.py | 2 ++ app/mcp/v1/tools/es_reader.py | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/app/mcp/v1/tools/ch_reader.py b/app/mcp/v1/tools/ch_reader.py index 09788a8..71b6a3d 100644 --- a/app/mcp/v1/tools/ch_reader.py +++ b/app/mcp/v1/tools/ch_reader.py @@ -157,6 +157,8 @@ def get_trend_data_ch( - Keep in mind that when there is data from multiple devices spanning the same time period, there is a possibility of data being duplicated. 
Inform the user of this possibility if you see multiple devices in the same time period. + - If a user asks you to sum up some values from their health records, DO NOT + search for records and write a script to sum them, instead, use this tool. - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. diff --git a/app/mcp/v1/tools/duckdb_reader.py b/app/mcp/v1/tools/duckdb_reader.py index e34bb5b..7e367c3 100644 --- a/app/mcp/v1/tools/duckdb_reader.py +++ b/app/mcp/v1/tools/duckdb_reader.py @@ -157,6 +157,8 @@ def get_trend_data_duckdb( - Keep in mind that when there is data from multiple devices spanning the same time period, there is a possibility of data being duplicated. Inform the user of this possibility if you see multiple devices in the same time period. + - If a user asks you to sum up some values from their health records, DO NOT + search for records and write a script to sum them, instead, use this tool. - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. 
diff --git a/app/mcp/v1/tools/es_reader.py b/app/mcp/v1/tools/es_reader.py index 2281819..a092bd2 100644 --- a/app/mcp/v1/tools/es_reader.py +++ b/app/mcp/v1/tools/es_reader.py @@ -142,9 +142,11 @@ def get_trend_data_es( Returns: - record_type: The analyzed record type + - device: The device on which the data was recorded - interval: The time interval used - trend_data: List of time buckets with statistics for each period: * date: The time period (ISO string) + * value_sum: Sum of values for the period * avg_value: Average value for the period * min_value: Minimum value for the period * max_value: Maximum value for the period @@ -152,6 +154,11 @@ def get_trend_data_es( Notes for LLMs: - Use this to analyze trends, patterns, and seasonal variations in health data + - Keep in mind that when there is data from multiple devices spanning the same + time period, there is a possibility of data being duplicated. Inform the user + of this possibility if you see multiple devices in the same time period. + - If a user asks you to sum up some values from their health records, DO NOT + search for records and write a script to sum them, instead, use this tool. - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. 
From 0e42919cb32879e3b53999896fb2d1bd0135b4c0 Mon Sep 17 00:00:00 2001 From: czajkub Date: Fri, 12 Sep 2025 15:34:01 +0200 Subject: [PATCH 07/13] remove debug code --- app/services/health/clickhouse.py | 3 --- app/services/health/duckdb_queries.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/app/services/health/clickhouse.py b/app/services/health/clickhouse.py index 7cf09f5..0a5bcd5 100644 --- a/app/services/health/clickhouse.py +++ b/app/services/health/clickhouse.py @@ -54,6 +54,3 @@ def search_values_from_ch( {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} """) - -if __name__ == "__main__": - print(get_trend_data_from_ch("HKQuantityTypeIdentifierStepCount", "week", "2023-03-01", "2023-04-01")) \ No newline at end of file diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index d4b0156..72b8bcf 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -69,6 +69,3 @@ def search_values_from_duckdb( {f"AND startDate <= '{date_to}'" if date_to else ""} """) return client.format_response(result) - -if __name__ == "__main__": - print(get_trend_data_from_duckdb("HKQuantityTypeIdentifierStepCount", "week", "2023-03-01", "2023-04-01")) \ No newline at end of file From c3cbcb6e1641b23838608142086926d73b6c9c88 Mon Sep 17 00:00:00 2001 From: czajkub Date: Mon, 15 Sep 2025 09:26:55 +0200 Subject: [PATCH 08/13] standardise errors and change trend docstrings --- app/mcp/v1/tools/ch_reader.py | 9 ++++++--- app/mcp/v1/tools/duckdb_reader.py | 11 +++++++---- app/mcp/v1/tools/es_reader.py | 9 ++++++--- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/app/mcp/v1/tools/ch_reader.py b/app/mcp/v1/tools/ch_reader.py index 71b6a3d..9043fc7 100644 --- a/app/mcp/v1/tools/ch_reader.py +++ b/app/mcp/v1/tools/ch_reader.py @@ -36,7 +36,7 @@ def get_health_summary_ch() -> dict[str, Any]: try: return get_health_summary_from_ch() 
except Exception as e: - return {"error": str(e)} + return {"error": f"Failed to get health summary: {str(e)}"} @ch_reader_router.tool @@ -71,7 +71,7 @@ def search_health_records_ch(params: HealthRecordSearchParams) -> dict[str, Any] try: return search_health_records_from_ch(params) except Exception as e: - return {"error": str(e)} + return {"error": f"Failed to search health records: {str(e)}"} @ch_reader_router.tool @@ -158,7 +158,10 @@ def get_trend_data_ch( time period, there is a possibility of data being duplicated. Inform the user of this possibility if you see multiple devices in the same time period. - If a user asks you to sum up some values from their health records, DO NOT - search for records and write a script to sum them, instead, use this tool. + search for records and write a script to sum them, instead, use this tool: + if they ask to sum data from a year, use this tool with date_from set as the + beginning of the year and date_to as the end of the year, with an interval + of 'year' - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. 
diff --git a/app/mcp/v1/tools/duckdb_reader.py b/app/mcp/v1/tools/duckdb_reader.py index 7e367c3..b064cd1 100644 --- a/app/mcp/v1/tools/duckdb_reader.py +++ b/app/mcp/v1/tools/duckdb_reader.py @@ -36,7 +36,7 @@ def get_health_summary_duckdb() -> list[dict[str, Any]]: try: return get_health_summary_from_duckdb() except Exception as e: - return [{"error": str(e)}] + return [{"error": f"Failed to get health summary: {str(e)}"}] @duckdb_reader_router.tool @@ -71,7 +71,7 @@ def search_health_records_duckdb(params: HealthRecordSearchParams) -> list[dict[ try: return search_health_records_from_duckdb(params) except Exception as e: - return [{"error": str(e)}] + return [{"error": f"Failed to search health records: {str(e)}"}] @duckdb_reader_router.tool @@ -158,7 +158,10 @@ def get_trend_data_duckdb( time period, there is a possibility of data being duplicated. Inform the user of this possibility if you see multiple devices in the same time period. - If a user asks you to sum up some values from their health records, DO NOT - search for records and write a script to sum them, instead, use this tool. + search for records and write a script to sum them, instead, use this tool: + if they ask to sum data from a year, use this tool with date_from set as the + beginning of the year and date_to as the end of the year, with an interval + of 'year' - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. 
@@ -214,4 +217,4 @@ def search_values_duckdb( try: return search_values_from_duckdb(record_type, value, date_from, date_to) except Exception as e: - return [{"error": f"Failed to get trend data: {str(e)}"}] + return [{"error": f"Failed to search for values: {str(e)}"}] diff --git a/app/mcp/v1/tools/es_reader.py b/app/mcp/v1/tools/es_reader.py index a092bd2..6cbcccd 100644 --- a/app/mcp/v1/tools/es_reader.py +++ b/app/mcp/v1/tools/es_reader.py @@ -36,7 +36,7 @@ def get_health_summary_es() -> dict[str, Any]: try: return get_health_summary_from_es() except Exception as e: - return {"error": f"Failed to get health summary from ES: {str(e)}"} + return {"error": f"Failed to get health summary: {str(e)}"} @es_reader_router.tool @@ -158,7 +158,10 @@ def get_trend_data_es( time period, there is a possibility of data being duplicated. Inform the user of this possibility if you see multiple devices in the same time period. - If a user asks you to sum up some values from their health records, DO NOT - search for records and write a script to sum them, instead, use this tool. + search for records and write a script to sum them, instead, use this tool: + if they ask to sum data from a year, use this tool with date_from set as the + beginning of the year and date_to as the end of the year, with an interval + of 'year' - The function automatically handles date filtering if date_from/date_to are provided - IMPORTANT - interval must be one of: "day", "week", "month", or "year". Do not use other values. 
@@ -214,4 +217,4 @@ def search_values_es( try: return search_values_logic(record_type, value, date_from, date_to) except Exception as e: - return [{"error": f"Failed to get trend data: {str(e)}"}] + return [{"error": f"Failed to search for values: {str(e)}"}] From 18e16d0c0337bccb129b9a3bc9c5ce9ff9cb1196 Mon Sep 17 00:00:00 2001 From: czajkub Date: Wed, 17 Sep 2025 10:13:13 +0200 Subject: [PATCH 09/13] add localhost support for parquet also change parquetpath to path and add .parquet suffix to the path in config --- app/config.py | 4 ++-- app/services/duckdb_client.py | 20 ++++++++++++++++---- app/services/health/duckdb_queries.py | 11 ++++++----- config/.env.example | 2 +- scripts/duckdb_importer.py | 2 +- scripts/xml_exporter.py | 4 ++-- 6 files changed, 28 insertions(+), 15 deletions(-) diff --git a/app/config.py b/app/config.py index b2270dc..a2fe6c3 100644 --- a/app/config.py +++ b/app/config.py @@ -30,7 +30,7 @@ class Settings(BaseSettings): CH_DB_NAME: str = "applehealth" CH_TABLE_NAME: str = "data" - DUCKDB_FILENAME: str = "applehealth" + DUCKDB_FILENAME: str = "applehealth.parquet" CHUNK_SIZE: int = 50_000 @@ -54,7 +54,7 @@ def assemble_cors_origins(cls, v: str | list[str]) -> list[str] | str: @lru_cache def get_settings() -> Settings: - return Settings() # type: ignore[call-arg + return Settings() # type: ignore[call-arg] settings = get_settings() diff --git a/app/services/duckdb_client.py b/app/services/duckdb_client.py index c2e7659..c83c8ff 100644 --- a/app/services/duckdb_client.py +++ b/app/services/duckdb_client.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Any +import duckdb from duckdb import DuckDBPyRelation from app.config import settings @@ -9,12 +10,23 @@ @dataclass class DuckDBClient: - def __init__(self): - self.parquetpath: Path = Path(f"{settings.DUCKDB_FILENAME}.parquet") + path: Path | str = f"{settings.DUCKDB_FILENAME}" def __post_init__(self): - if not self.parquetpath.exists(): - raise FileNotFoundError(f"Parquet file 
not found: {self.parquetpath}") + print("__post_init__") + if self.path.startswith("localhost"): + self.path = "http://" + self.path + + if self.path.startswith(("http://", "https://")): + duckdb.sql(""" + INSTALL httpfs; + LOAD httpfs; + """) + else: + self.path = Path(self.path) + + if isinstance(self.path, Path) and not self.path.exists(): + raise FileNotFoundError(f"Parquet file not found: {self.path}") @staticmethod def format_response(response: DuckDBPyRelation) -> list[dict[str, Any]]: diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 72b8bcf..78686c9 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -11,7 +11,8 @@ def get_health_summary_from_duckdb() -> list[dict[str, Any]]: response = duckdb.sql( - f"SELECT type, COUNT(*) AS count FROM read_parquet('{client.parquetpath}') GROUP BY ALL", + f"""SELECT type, COUNT(*) AS count FROM read_parquet('{client.path}')" + GROUP BY type ORDER BY count DESC""", ) return client.format_response(response) @@ -19,7 +20,7 @@ def get_health_summary_from_duckdb() -> list[dict[str, Any]]: def search_health_records_from_duckdb( params: HealthRecordSearchParams, ) -> list[dict[str, Any]]: - query: str = f"SELECT * FROM read_parquet('{client.parquetpath}')" + query: str = f"SELECT * FROM read_parquet('{client.path}')" query += fill_query(params) response = duckdb.sql(query) return client.format_response(response) @@ -31,7 +32,7 @@ def get_statistics_by_type_from_duckdb( result = duckdb.sql(f""" SELECT type, COUNT(*) AS count, AVG(value) AS average, SUM(value) AS sum, MIN(value) AS min, MAX(value) AS max - FROM read_parquet('{client.parquetpath}') + FROM read_parquet('{client.path}') WHERE type = '{record_type}' GROUP BY type """) return client.format_response(result) @@ -47,7 +48,7 @@ def get_trend_data_from_duckdb( SELECT device, time_bucket(INTERVAL '1 {interval}', startDate) AS interval, AVG(value) AS average, SUM(value) AS sum, 
MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count - FROM read_parquet('{client.parquetpath}') + FROM read_parquet('{client.path}') WHERE type = '{record_type}' {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} @@ -63,7 +64,7 @@ def search_values_from_duckdb( date_to: str | None = None, ) -> list[dict[str, Any]]: result = duckdb.sql(f""" - SELECT * FROM read_parquet('{client.parquetpath}') WHERE textvalue = '{value}' + SELECT * FROM read_parquet('{client.path}') WHERE textvalue = '{value}' {f"AND type = '{record_type}'" if record_type else ""} {f"AND startDate >= '{date_from}'" if date_from else ""} {f"AND startDate <= '{date_to}'" if date_to else ""} diff --git a/config/.env.example b/config/.env.example index 7f9d2a5..035f920 100644 --- a/config/.env.example +++ b/config/.env.example @@ -4,6 +4,6 @@ ES_HOST="localhost" CH_DIRNAME="applehealth.chdb" CH_DB_NAME="applehealth" CH_TABLE_NAME="data" -DUCKDB_FILENAME="applehealth" +DUCKDB_FILENAME="applehealth.parquet" CHUNK_SIZE="50000" RAW_XML_PATH="raw.xml" diff --git a/scripts/duckdb_importer.py b/scripts/duckdb_importer.py index b39929c..138fabb 100644 --- a/scripts/duckdb_importer.py +++ b/scripts/duckdb_importer.py @@ -35,7 +35,7 @@ def exportxml(self) -> None: chunk_dfs.append(df) combined_df = pl.concat(chunk_dfs) - combined_df.write_parquet(f"{self.parquetpath}", compression="zstd") + combined_df.write_parquet(f"{self.path}", compression="zstd") for f in chunkfiles: os.remove(f) diff --git a/scripts/xml_exporter.py b/scripts/xml_exporter.py index 5fffcf1..b7d0c64 100644 --- a/scripts/xml_exporter.py +++ b/scripts/xml_exporter.py @@ -10,7 +10,7 @@ class XMLExporter: def __init__(self): - self.path: Path = Path(settings.RAW_XML_PATH) + self.xmlpath: Path = Path(settings.RAW_XML_PATH) self.chunk_size: int = settings.CHUNK_SIZE DATE_FIELDS: tuple[str, ...] 
= ("startDate", "endDate", "creationDate") @@ -62,7 +62,7 @@ def parse_xml(self) -> Generator[DataFrame, Any, None]: """ records: list[dict[str, Any]] = [] - for event, elem in ET.iterparse(self.path, events=("start",)): + for event, elem in ET.iterparse(self.xmlpath, events=("start",)): if elem.tag == "Record" and event == "start": if len(records) >= self.chunk_size: yield DataFrame(records).reindex(columns=self.COLUMN_NAMES) From fab22eb91507081e5a1c39009a507425bfc7c753 Mon Sep 17 00:00:00 2001 From: czajkub Date: Wed, 17 Sep 2025 11:44:23 +0200 Subject: [PATCH 10/13] remove debug from client --- app/services/duckdb_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/app/services/duckdb_client.py b/app/services/duckdb_client.py index c83c8ff..5ebb4be 100644 --- a/app/services/duckdb_client.py +++ b/app/services/duckdb_client.py @@ -13,7 +13,6 @@ class DuckDBClient: path: Path | str = f"{settings.DUCKDB_FILENAME}" def __post_init__(self): - print("__post_init__") if self.path.startswith("localhost"): self.path = "http://" + self.path From b619143de480fd6b36ae8e4151cf530b8186e47c Mon Sep 17 00:00:00 2001 From: czajkub Date: Wed, 17 Sep 2025 13:33:25 +0200 Subject: [PATCH 11/13] unterminated string --- app/services/health/duckdb_queries.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 78686c9..52fe471 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -11,7 +11,7 @@ def get_health_summary_from_duckdb() -> list[dict[str, Any]]: response = duckdb.sql( - f"""SELECT type, COUNT(*) AS count FROM read_parquet('{client.path}')" + f"""SELECT type, COUNT(*) AS count FROM read_parquet('{client.path}') GROUP BY type ORDER BY count DESC""", ) return client.format_response(response) @@ -70,3 +70,6 @@ def search_values_from_duckdb( {f"AND startDate <= '{date_to}'" if date_to else ""} """) return 
client.format_response(result) + +if __name__=="__main__": + print(get_health_summary_from_duckdb()) \ No newline at end of file From 1c1678f2d35f8e9354012e87cc54fc7ecce89a03 Mon Sep 17 00:00:00 2001 From: czajkub Date: Wed, 17 Sep 2025 14:31:00 +0200 Subject: [PATCH 12/13] remove debug and add fileserver example --- app/services/health/duckdb_queries.py | 3 --- tests/fileserver.py | 32 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 tests/fileserver.py diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py index 52fe471..4cd9937 100644 --- a/app/services/health/duckdb_queries.py +++ b/app/services/health/duckdb_queries.py @@ -70,6 +70,3 @@ def search_values_from_duckdb( {f"AND startDate <= '{date_to}'" if date_to else ""} """) return client.format_response(result) - -if __name__=="__main__": - print(get_health_summary_from_duckdb()) \ No newline at end of file diff --git a/tests/fileserver.py b/tests/fileserver.py new file mode 100644 index 0000000..462d82d --- /dev/null +++ b/tests/fileserver.py @@ -0,0 +1,32 @@ +import argparse + +import uvicorn +from fastapi import FastAPI +from fastapi.responses import FileResponse + +app = FastAPI() + + +@app.get("/{filename}") +async def serve_file(filename: str) -> FileResponse: + return FileResponse(filename) + + +parser = argparse.ArgumentParser( + prog="Filesystem server", + description="Host local files in this directory on localhost", +) +parser.add_argument( + "-p", + "--port", + type=int, + help="Port on which to serve", + default=8080, + dest="port", + action="store", +) + +if __name__ == "__main__": + args = parser.parse_args() + port = args.port + uvicorn.run(app, host="localhost", port=port) From bd0bb509a4f6854613c07efb909a5b76730921f1 Mon Sep 17 00:00:00 2001 From: Jakub Czajka Date: Wed, 17 Sep 2025 14:37:53 +0200 Subject: [PATCH 13/13] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/README.md b/README.md index 6db3c71..0e8064d 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,10 @@ Follow these steps to set up Apple Health MCP Server in your environment. 4. Lastly, if you're going to be using DuckDB: - Run `make duckdb` to create a parquet file with your exported XML data + - If you want to read the parquet file over HTTP(S) instead of from local disk: + - Set `DUCKDB_FILENAME` in your `.env` file to the file's address, e.g. `localhost:8080/applehealth.parquet` (a `localhost` address gets `http://` prepended automatically) + - For an example of how to host the file locally, run `uv run tests/fileserver.py` + ### Configuration Files @@ -235,6 +239,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching, | `search_health_records_es` | Flexible search for health records in Elasticsearch with advanced filtering and query options. | | `get_statistics_by_type_es` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. | | `get_trend_data_es` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). | +| `search_values_es` | Search for records with exactly matching values (including text). | ### ClickHouse Tools (`ch_reader`) @@ -244,6 +249,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching, | `search_health_records_ch` | Flexible search for health records in ClickHouse with advanced filtering and query options. | | `get_statistics_by_type_ch` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. | | `get_trend_data_ch` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). | +| `search_values_ch` | Search for records with exactly matching values (including text). 
| ### DuckDB Tools (`duckdb_reader`) @@ -253,6 +259,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching, | `search_health_records_duckdb` | Flexible search for health records in DuckDB with advanced filtering and query options. | | `get_statistics_by_type_duckdb` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. | | `get_trend_data_duckdb` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). | +| `search_values_duckdb` | Search for records with exactly matching values (including text). | All tools are accessible via MCP-compatible clients and can be used with natural language or programmatic queries to explore and analyze your Apple Health data.