diff --git a/README.md b/README.md
index 6db3c71..0e8064d 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,10 @@ Follow these steps to set up Apple Health MCP Server in your environment.
 4. Lastly, if you're going to be using DuckDB:
    - Run `make duckdb` to create a parquet file with your exported XML data
+   - If you want to connect to the file over http(s):
+     - All you need to do is change the path in your `.env`, e.g. `localhost:8080/applehealth.parquet`
+     - For an example of how to host the file locally, run `uv run tests/fileserver.py`
+
 
 ### Configuration Files
 
@@ -235,6 +239,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_es` | Flexible search for health records in Elasticsearch with advanced filtering and query options. |
 | `get_statistics_by_type_es` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. |
 | `get_trend_data_es` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). |
+| `search_values_es` | Search for records whose value exactly matches a given value, including text values. |
 
 ### ClickHouse Tools (`ch_reader`)
 
@@ -244,6 +249,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_ch` | Flexible search for health records in ClickHouse with advanced filtering and query options. |
 | `get_statistics_by_type_ch` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. |
 | `get_trend_data_ch` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). |
+| `search_values_ch` | Search for records whose value exactly matches a given value, including text values. |
 
 ### DuckDB Tools (`duckdb_reader`)
 
@@ -253,6 +259,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_duckdb` | Flexible search for health records in DuckDB with advanced filtering and query options. |
 | `get_statistics_by_type_duckdb` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type. |
 | `get_trend_data_duckdb` | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations). |
+| `search_values_duckdb` | Search for records whose value exactly matches a given value, including text values. |
 
 All tools are accessible via MCP-compatible clients and can be used with natural language or programmatic queries to explore and analyze your Apple Health data.
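A minimal sketch (not part of this diff) of what the http(s) option above amounts to: once the `.env` path is a URL, DuckDB reads the parquet file through its `httpfs` extension, mirroring what `DuckDBClient.__post_init__` does in the `app/services/duckdb_client.py` change below. The `localhost:8080` URL is the README's example and assumes the file is already being served, e.g. by `tests/fileserver.py`.

```python
import duckdb

# The client installs/loads httpfs for http(s) paths; doing the same here by hand.
duckdb.sql("INSTALL httpfs; LOAD httpfs;")

# Same shape of query as get_health_summary_from_duckdb, but against a served file.
url = "http://localhost:8080/applehealth.parquet"  # assumed: hosted by tests/fileserver.py
summary = duckdb.sql(
    f"SELECT type, COUNT(*) AS count FROM read_parquet('{url}') "
    "GROUP BY type ORDER BY count DESC"
)
print(summary.fetchall())
```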
diff --git a/app/config.py b/app/config.py
index b2270dc..a2fe6c3 100644
--- a/app/config.py
+++ b/app/config.py
@@ -30,7 +30,7 @@ class Settings(BaseSettings):
     CH_DB_NAME: str = "applehealth"
     CH_TABLE_NAME: str = "data"
 
-    DUCKDB_FILENAME: str = "applehealth"
+    DUCKDB_FILENAME: str = "applehealth.parquet"
 
     CHUNK_SIZE: int = 50_000
 
@@ -54,7 +54,7 @@ def assemble_cors_origins(cls, v: str | list[str]) -> list[str] | str:
 
 @lru_cache
 def get_settings() -> Settings:
-    return Settings()  # type: ignore[call-arg
+    return Settings()  # type: ignore[call-arg]
 
 
 settings = get_settings()
diff --git a/app/services/duckdb_client.py b/app/services/duckdb_client.py
index c2e7659..5ebb4be 100644
--- a/app/services/duckdb_client.py
+++ b/app/services/duckdb_client.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import Any
 
+import duckdb
 from duckdb import DuckDBPyRelation
 
 from app.config import settings
@@ -9,12 +10,22 @@
 
 @dataclass
 class DuckDBClient:
-    def __init__(self):
-        self.parquetpath: Path = Path(f"{settings.DUCKDB_FILENAME}.parquet")
+    path: Path | str = f"{settings.DUCKDB_FILENAME}"
 
     def __post_init__(self):
-        if not self.parquetpath.exists():
-            raise FileNotFoundError(f"Parquet file not found: {self.parquetpath}")
+        if self.path.startswith("localhost"):
+            self.path = "http://" + self.path
+
+        if self.path.startswith(("http://", "https://")):
+            duckdb.sql("""
+                INSTALL httpfs;
+                LOAD httpfs;
+            """)
+        else:
+            self.path = Path(self.path)
+
+        if isinstance(self.path, Path) and not self.path.exists():
+            raise FileNotFoundError(f"Parquet file not found: {self.path}")
 
     @staticmethod
     def format_response(response: DuckDBPyRelation) -> list[dict[str, Any]]:
diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py
index 72b8bcf..4cd9937 100644
--- a/app/services/health/duckdb_queries.py
+++ b/app/services/health/duckdb_queries.py
@@ -11,7 +11,8 @@
 def get_health_summary_from_duckdb() -> list[dict[str, Any]]:
     response = duckdb.sql(
-        f"SELECT type, COUNT(*) AS count FROM read_parquet('{client.parquetpath}') GROUP BY ALL",
+        f"""SELECT type, COUNT(*) AS count FROM read_parquet('{client.path}')
+        GROUP BY type ORDER BY count DESC""",
     )
     return client.format_response(response)
 
@@ -19,7 +20,7 @@ def get_health_summary_from_duckdb() -> list[dict[str, Any]]:
 def search_health_records_from_duckdb(
     params: HealthRecordSearchParams,
 ) -> list[dict[str, Any]]:
-    query: str = f"SELECT * FROM read_parquet('{client.parquetpath}')"
+    query: str = f"SELECT * FROM read_parquet('{client.path}')"
     query += fill_query(params)
     response = duckdb.sql(query)
     return client.format_response(response)
@@ -31,7 +32,7 @@ def get_statistics_by_type_from_duckdb(
     result = duckdb.sql(f"""
         SELECT type, COUNT(*) AS count, AVG(value) AS average, SUM(value) AS sum,
         MIN(value) AS min, MAX(value) AS max
-        FROM read_parquet('{client.parquetpath}')
+        FROM read_parquet('{client.path}')
         WHERE type = '{record_type}' GROUP BY type
     """)
     return client.format_response(result)
@@ -47,7 +48,7 @@ def get_trend_data_from_duckdb(
         SELECT device, time_bucket(INTERVAL '1 {interval}', startDate) AS interval,
         AVG(value) AS average, SUM(value) AS sum, MIN(value) AS min,
         MAX(value) AS max, COUNT(*) AS count
-        FROM read_parquet('{client.parquetpath}')
+        FROM read_parquet('{client.path}')
         WHERE type = '{record_type}'
         {f"AND startDate >= '{date_from}'" if date_from else ""}
         {f"AND startDate <= '{date_to}'" if date_to else ""}
@@ -63,7 +64,7 @@ def search_values_from_duckdb(
     date_to: str | None = None,
 ) -> list[dict[str, Any]]:
     result = duckdb.sql(f"""
-        SELECT * FROM read_parquet('{client.parquetpath}') WHERE textvalue = '{value}'
+        SELECT * FROM read_parquet('{client.path}') WHERE textvalue = '{value}'
         {f"AND type = '{record_type}'" if record_type else ""}
         {f"AND startDate >= '{date_from}'" if date_from else ""}
         {f"AND startDate <= '{date_to}'" if date_to else ""}
diff --git a/config/.env.example b/config/.env.example
index 7f9d2a5..035f920 100644
--- a/config/.env.example
+++ b/config/.env.example
@@ -4,6 +4,6 @@ ES_HOST="localhost"
 CH_DIRNAME="applehealth.chdb"
 CH_DB_NAME="applehealth"
 CH_TABLE_NAME="data"
-DUCKDB_FILENAME="applehealth"
+DUCKDB_FILENAME="applehealth.parquet"
 CHUNK_SIZE="50000"
 RAW_XML_PATH="raw.xml"
diff --git a/scripts/duckdb_importer.py b/scripts/duckdb_importer.py
index b39929c..138fabb 100644
--- a/scripts/duckdb_importer.py
+++ b/scripts/duckdb_importer.py
@@ -35,7 +35,7 @@ def exportxml(self) -> None:
             chunk_dfs.append(df)
 
         combined_df = pl.concat(chunk_dfs)
-        combined_df.write_parquet(f"{self.parquetpath}", compression="zstd")
+        combined_df.write_parquet(f"{self.path}", compression="zstd")
 
         for f in chunkfiles:
             os.remove(f)
diff --git a/scripts/xml_exporter.py b/scripts/xml_exporter.py
index 5fffcf1..b7d0c64 100644
--- a/scripts/xml_exporter.py
+++ b/scripts/xml_exporter.py
@@ -10,7 +10,7 @@
 class XMLExporter:
     def __init__(self):
-        self.path: Path = Path(settings.RAW_XML_PATH)
+        self.xmlpath: Path = Path(settings.RAW_XML_PATH)
         self.chunk_size: int = settings.CHUNK_SIZE
 
     DATE_FIELDS: tuple[str, ...] = ("startDate", "endDate", "creationDate")
 
@@ -62,7 +62,7 @@ def parse_xml(self) -> Generator[DataFrame, Any, None]:
         """
         records: list[dict[str, Any]] = []
 
-        for event, elem in ET.iterparse(self.path, events=("start",)):
+        for event, elem in ET.iterparse(self.xmlpath, events=("start",)):
             if elem.tag == "Record" and event == "start":
                 if len(records) >= self.chunk_size:
                     yield DataFrame(records).reindex(columns=self.COLUMN_NAMES)
diff --git a/tests/fileserver.py b/tests/fileserver.py
new file mode 100644
index 0000000..462d82d
--- /dev/null
+++ b/tests/fileserver.py
@@ -0,0 +1,32 @@
+import argparse
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
+
+app = FastAPI()
+
+
+@app.get("/{filename}")
+async def serve_file(filename: str) -> FileResponse:
+    return FileResponse(filename)
+
+
+parser = argparse.ArgumentParser(
+    prog="Filesystem server",
+    description="Host local files in this directory on localhost",
+)
+parser.add_argument(
+    "-p",
+    "--port",
+    type=int,
+    help="Port on which to serve",
+    default=8080,
+    dest="port",
+    action="store",
+)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    port = args.port
+    uvicorn.run(app, host="localhost", port=port)