the-momentum · czajkub · Sep 17, 2025 · Sep 12, 2025 · Sep 12, 2025 · Sep 12, 2025
diff --git a/README.md b/README.md
@@ -105,6 +105,10 @@ Follow these steps to set up Apple Health MCP Server in your environment.
 
 4. Lastly, if you're going to be using DuckDB:
    - Run `make duckdb` to create a parquet file with your exported XML data
+   - If you want to connect to the file through http(s):
+     - Set `DUCKDB_FILENAME` in your `.env` to the file's URL, e.g. `localhost:8080/applehealth.parquet` (a path starting with `localhost` gets `http://` prepended automatically)
+     - For an example of how to host the file locally, run `uv run tests/fileserver.py` (serves on `localhost:8080` by default; pass `-p <port>` to change the port)
+
 
 ### Configuration Files
 
@@ -235,6 +239,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_es`  | Flexible search for health records in Elasticsearch with advanced filtering and query options.        |
 | `get_statistics_by_type_es` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type.          |
 | `get_trend_data_es`         | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations).     |
+| `search_values_es`          | Search for records with exactly matching values (including text).     |
 
 ### ClickHouse Tools (`ch_reader`)
 
@@ -244,6 +249,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_ch`  | Flexible search for health records in ClickHouse with advanced filtering and query options.        |
 | `get_statistics_by_type_ch` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type.          |
 | `get_trend_data_ch`         | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations).     |
+| `search_values_ch`          | Search for records with exactly matching values (including text).     |
 
 ### DuckDB Tools (`duckdb_reader`)
 
@@ -253,6 +259,7 @@ The Apple Health MCP Server provides a suite of tools for exploring, searching,
 | `search_health_records_duckdb`  | Flexible search for health records in DuckDB with advanced filtering and query options.        |
 | `get_statistics_by_type_duckdb` | Get comprehensive statistics (count, min, max, avg, sum) for a specific health record type.          |
 | `get_trend_data_duckdb`         | Analyze trends for a health record type over time (daily, weekly, monthly, yearly aggregations).     |
+| `search_values_duckdb`          | Search for records with exactly matching values (including text).     |
 
 All tools are accessible via MCP-compatible clients and can be used with natural language or programmatic queries to explore and analyze your Apple Health data.
 

diff --git a/app/config.py b/app/config.py
@@ -30,7 +30,7 @@ class Settings(BaseSettings):
     CH_DB_NAME: str = "applehealth"
     CH_TABLE_NAME: str = "data"
 
-    DUCKDB_FILENAME: str = "applehealth"
+    DUCKDB_FILENAME: str = "applehealth.parquet"
 
     CHUNK_SIZE: int = 50_000
 
@@ -54,7 +54,7 @@ def assemble_cors_origins(cls, v: str | list[str]) -> list[str] | str:
 
 @lru_cache
 def get_settings() -> Settings:
-    return Settings()  # type: ignore[call-arg
+    return Settings()  # type: ignore[call-arg]
 
 
 settings = get_settings()
diff --git a/app/services/duckdb_client.py b/app/services/duckdb_client.py
@@ -2,19 +2,30 @@
 from pathlib import Path
 from typing import Any
 
+import duckdb
 from duckdb import DuckDBPyRelation
 
 from app.config import settings
 
 
 @dataclass
 class DuckDBClient:
-    def __init__(self):
-        self.parquetpath: Path = Path(f"{settings.DUCKDB_FILENAME}.parquet")
+    path: Path | str = f"{settings.DUCKDB_FILENAME}"
 
     def __post_init__(self):
-        if not self.parquetpath.exists():
-            raise FileNotFoundError(f"Parquet file not found: {self.parquetpath}")
+        if self.path.startswith("localhost"):
+            self.path = "http://" + self.path
+
+        if self.path.startswith(("http://", "https://")):
+            duckdb.sql("""
+                    INSTALL httpfs;
+                    LOAD httpfs;
+                """)
+        else:
+            self.path = Path(self.path)
+
+        if isinstance(self.path, Path) and not self.path.exists():
+            raise FileNotFoundError(f"Parquet file not found: {self.path}")
 
     @staticmethod
     def format_response(response: DuckDBPyRelation) -> list[dict[str, Any]]:

diff --git a/app/services/health/duckdb_queries.py b/app/services/health/duckdb_queries.py
@@ -11,15 +11,16 @@
 
 def get_health_summary_from_duckdb() -> list[dict[str, Any]]:
     response = duckdb.sql(
-        f"SELECT type, COUNT(*) AS count FROM read_parquet('{client.parquetpath}') GROUP BY ALL",
+        f"""SELECT type, COUNT(*) AS count FROM read_parquet('{client.path}')
+         GROUP BY type ORDER BY count DESC""",
     )
     return client.format_response(response)
 
 
 def search_health_records_from_duckdb(
     params: HealthRecordSearchParams,
 ) -> list[dict[str, Any]]:
-    query: str = f"SELECT * FROM read_parquet('{client.parquetpath}')"
+    query: str = f"SELECT * FROM read_parquet('{client.path}')"
     query += fill_query(params)
     response = duckdb.sql(query)
     return client.format_response(response)
@@ -31,7 +32,7 @@ def get_statistics_by_type_from_duckdb(
     result = duckdb.sql(f"""
                     SELECT type, COUNT(*) AS count, AVG(value) AS average,
                     SUM(value) AS sum, MIN(value) AS min, MAX(value) AS max
-                    FROM read_parquet('{client.parquetpath}')
+                    FROM read_parquet('{client.path}')
                     WHERE type = '{record_type}' GROUP BY type
                     """)
     return client.format_response(result)
@@ -47,7 +48,7 @@ def get_trend_data_from_duckdb(
         SELECT device, time_bucket(INTERVAL '1 {interval}', startDate) AS interval,
         AVG(value) AS average, SUM(value) AS sum,
         MIN(value) AS min, MAX(value) AS max, COUNT(*) AS count
-        FROM read_parquet('{client.parquetpath}')
+        FROM read_parquet('{client.path}')
         WHERE type = '{record_type}'
         {f"AND startDate >= '{date_from}'" if date_from else ""}
         {f"AND startDate <= '{date_to}'" if date_to else ""}
@@ -63,7 +64,7 @@ def search_values_from_duckdb(
     date_to: str | None = None,
 ) -> list[dict[str, Any]]:
     result = duckdb.sql(f"""
-        SELECT * FROM read_parquet('{client.parquetpath}') WHERE textvalue = '{value}'
+        SELECT * FROM read_parquet('{client.path}') WHERE textvalue = '{value}'
         {f"AND type = '{record_type}'" if record_type else ""}
         {f"AND startDate >= '{date_from}'" if date_from else ""}
         {f"AND startDate <= '{date_to}'" if date_to else ""}

diff --git a/config/.env.example b/config/.env.example
@@ -4,6 +4,6 @@ ES_HOST="localhost"
 CH_DIRNAME="applehealth.chdb"
 CH_DB_NAME="applehealth"
 CH_TABLE_NAME="data"
-DUCKDB_FILENAME="applehealth"
+DUCKDB_FILENAME="applehealth.parquet"
 CHUNK_SIZE="50000"
 RAW_XML_PATH="raw.xml"
diff --git a/scripts/duckdb_importer.py b/scripts/duckdb_importer.py
@@ -35,7 +35,7 @@ def exportxml(self) -> None:
             chunk_dfs.append(df)
 
         combined_df = pl.concat(chunk_dfs)
-        combined_df.write_parquet(f"{self.parquetpath}", compression="zstd")
+        combined_df.write_parquet(f"{self.path}", compression="zstd")
 
         for f in chunkfiles:
             os.remove(f)

diff --git a/scripts/xml_exporter.py b/scripts/xml_exporter.py
@@ -10,7 +10,7 @@
 
 class XMLExporter:
     def __init__(self):
-        self.path: Path = Path(settings.RAW_XML_PATH)
+        self.xmlpath: Path = Path(settings.RAW_XML_PATH)
         self.chunk_size: int = settings.CHUNK_SIZE
 
     DATE_FIELDS: tuple[str, ...] = ("startDate", "endDate", "creationDate")
@@ -62,7 +62,7 @@ def parse_xml(self) -> Generator[DataFrame, Any, None]:
         """
         records: list[dict[str, Any]] = []
 
-        for event, elem in ET.iterparse(self.path, events=("start",)):
+        for event, elem in ET.iterparse(self.xmlpath, events=("start",)):
             if elem.tag == "Record" and event == "start":
                 if len(records) >= self.chunk_size:
                     yield DataFrame(records).reindex(columns=self.COLUMN_NAMES)

diff --git a/tests/fileserver.py b/tests/fileserver.py
@@ -0,0 +1,32 @@
+import argparse
+
+import uvicorn
+from fastapi import FastAPI
+from fastapi.responses import FileResponse
+
+app = FastAPI()
+
+
+@app.get("/{filename}")  # single route: GET /<filename>
+async def serve_file(filename: str) -> FileResponse:  # example helper for hosting the parquet file over HTTP
+    return FileResponse(filename)  # `filename` is passed through as given, with no validation here; presumably resolved against the working directory
+
+
+parser = argparse.ArgumentParser(  # command-line interface for the example file server
+    prog="Filesystem server",
+    description="Host local files in this directory on localhost",
+)
+parser.add_argument(
+    "-p",
+    "--port",
+    type=int,
+    help="Port on which to serve",
+    default=8080,  # used when -p/--port is not supplied
+    dest="port",
+    action="store",
+)
+
+if __name__ == "__main__":  # start the server only when executed as a script
+    args = parser.parse_args()
+    port = args.port
+    uvicorn.run(app, host="localhost", port=port)  # serve `app` on localhost at the chosen port