Skip to content

Commit 6542740

Browse files
authored
Change duckdb import from .parquet to .duckdb (#46)
Import XML data into a native .duckdb database with 3 tables instead of Parquet files. The old import method still exists, unused, with a changed path.
1 parent d615229 commit 6542740

File tree

13 files changed

+553
-126
lines changed

13 files changed

+553
-126
lines changed

.pre-commit-config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,10 @@ repos:
2020
hooks:
2121
- id: trailing-whitespace
2222
- id: end-of-file-fixer
23+
24+
exclude: |
25+
(?x)(
26+
^tests/
27+
^docs/
28+
^README.md/
29+
)

app/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class Settings(BaseSettings):
3030
CH_DB_NAME: str = "applehealth"
3131
CH_TABLE_NAME: str = "data"
3232

33-
DUCKDB_FILENAME: str = "applehealth.parquet"
33+
DUCKDB_FILENAME: str = "applehealth.duckdb"
3434

3535
CHUNK_SIZE: int = 50_000
3636

app/mcp/v1/tools/duckdb_reader.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from fastmcp import FastMCP
44

5-
from app.schemas.record import HealthRecordSearchParams, IntervalType, RecordType
5+
from app.schemas.record import HealthRecordSearchParams, IntervalType, RecordType, WorkoutType
66
from app.services.health.duckdb_queries import (
77
get_health_summary_from_duckdb,
88
get_statistics_by_type_from_duckdb,
@@ -11,7 +11,7 @@
1111
search_values_from_duckdb,
1212
)
1313

14-
duckdb_reader_router = FastMCP(name="CH Reader MCP")
14+
duckdb_reader_router = FastMCP(name="DuckDB Reader MCP")
1515

1616

1717
@duckdb_reader_router.tool
@@ -23,14 +23,16 @@ def get_health_summary_duckdb() -> list[dict[str, Any]]:
2323
2424
Notes for LLM:
2525
- IMPORTANT - Do not guess, autofill, or assume any missing data.
26-
- If there are multiple databases available (DuckDB, ClickHouse, Elasticsearch):
26+
- Use this tool if you're not certain which record type
27+
should be queried.
28+
- If there are multiple databases available (DuckDB, Elasticsearch):
2729
first, ask the user which one he wants to use. DO NOT call any tools before
2830
the user specifies his intent.
2931
- If the user decides on an option, only use tools from this database,
3032
do not switch over to another until the user specifies that he wants
3133
to use a different one. You do not have to keep asking whether
3234
the user wants to use the same database that he used before.
33-
- If there is only one database available (DuckDB, ClickHouse, Elasticsearch):
35+
- If there is only one database available (DuckDB, Elasticsearch):
3436
you can use the tools from this database without the user specifying it.
3537
"""
3638
try:
@@ -46,10 +48,11 @@ def search_health_records_duckdb(params: HealthRecordSearchParams) -> list[dict[
4648
4749
Parameters:
4850
- params: HealthRecordSearchParams object containing all search/filter parameters.
51+
(required parameters: record_type)
4952
5053
Notes for LLMs:
5154
- This function should return a list of health record documents (dicts)
52-
matching the search criteria.
55+
matching the search criteria ordered by date from most to least recent.
5356
- Each document in the list should represent a single health record as stored in DuckDB.
5457
- If an error occurs, the function should return a list with a single dict
5558
containing an 'error' key and the error message.
@@ -58,14 +61,16 @@ def search_health_records_duckdb(params: HealthRecordSearchParams) -> list[dict[
5861
- Example date_from/date_to: "2020-01-01T00:00:00+00:00"
5962
- Example value_min/value_max: "10", "100.5"
6063
- IMPORTANT - Do not guess, autofill, or assume any missing data.
61-
- If there are multiple databases available (DuckDB, ClickHouse, Elasticsearch):
64+
- This tool can be used to search for most recent records of a given type,
65+
in which case you should use this tool with a limit of 1.
66+
- If there are multiple databases available (DuckDB, Elasticsearch):
6267
first, ask the user which one he wants to use. DO NOT call any tools before
6368
the user specifies his intent.
6469
- If the user decides on an option, only use tools from this database,
6570
do not switch over to another until the user specifies that he wants
6671
to use a different one. You do not have to keep asking whether
6772
the user wants to use the same database that he used before.
68-
- If there is only one database available (DuckDB, ClickHouse, Elasticsearch):
73+
- If there is only one database available (DuckDB, Elasticsearch):
6974
you can use the tools from this database without the user specifying it.
7075
"""
7176
try:
@@ -75,7 +80,9 @@ def search_health_records_duckdb(params: HealthRecordSearchParams) -> list[dict[
7580

7681

7782
@duckdb_reader_router.tool
78-
def get_statistics_by_type_duckdb(record_type: RecordType | str) -> list[dict[str, Any]]:
83+
def get_statistics_by_type_duckdb(
84+
record_type: RecordType | WorkoutType | str,
85+
) -> list[dict[str, Any]]:
7986
"""
8087
Get comprehensive statistics for a specific health record type from DuckDB.
8188
@@ -105,17 +112,19 @@ def get_statistics_by_type_duckdb(record_type: RecordType | str) -> list[dict[st
105112
specific health metrics.
106113
- The function is useful for health analysis, identifying outliers, and
107114
understanding data quality.
115+
- This tool can also be used to figure out the value of the record with
116+
the shortest/longest duration or highest/lowest value
108117
- date_range key for query is commented, since it contained hardcoded from
109118
date, but you can use it anyway if you replace startDate with your data.
110119
- IMPORTANT - Do not guess, autofill, or assume any missing data.
111-
- If there are multiple databases available (DuckDB, ClickHouse, Elasticsearch):
120+
- If there are multiple databases available (DuckDB, Elasticsearch):
112121
first, ask the user which one he wants to use. DO NOT call any tools before
113122
the user specifies his intent.
114123
- If the user decides on an option, only use tools from this database,
115124
do not switch over to another until the user specifies that he wants
116125
to use a different one. You do not have to keep asking whether
117126
the user wants to use the same database that he used before.
118-
- If there is only one database available (DuckDB, ClickHouse, Elasticsearch):
127+
- If there is only one database available (DuckDB, Elasticsearch):
119128
you can use the tools from this database without the user specifying it.
120129
"""
121130
try:
@@ -126,7 +135,7 @@ def get_statistics_by_type_duckdb(record_type: RecordType | str) -> list[dict[st
126135

127136
@duckdb_reader_router.tool
128137
def get_trend_data_duckdb(
129-
record_type: RecordType | str,
138+
record_type: RecordType | WorkoutType | str,
130139
interval: IntervalType = "month",
131140
date_from: str | None = None,
132141
date_to: str | None = None,
@@ -166,14 +175,14 @@ def get_trend_data_duckdb(
166175
- IMPORTANT - interval must be one of: "day", "week", "month", or "year".
167176
Do not use other values.
168177
- Do not guess, autofill, or assume any missing data.
169-
- If there are multiple databases available (DuckDB, ClickHouse, Elasticsearch):
178+
- If there are multiple databases available (DuckDB, Elasticsearch):
170179
first, ask the user which one he wants to use. DO NOT call any tools before
171180
the user specifies his intent.
172181
- If the user decides on an option, only use tools from this database,
173182
do not switch over to another until the user specifies that he wants
174183
to use a different one. You do not have to keep asking whether
175184
the user wants to use the same database that he used before.
176-
- If there is only one database available (DuckDB, ClickHouse, Elasticsearch):
185+
- If there is only one database available (DuckDB, Elasticsearch):
177186
you can use the tools from this database without the user specifying it.
178187
"""
179188
try:
@@ -184,7 +193,7 @@ def get_trend_data_duckdb(
184193

185194
@duckdb_reader_router.tool
186195
def search_values_duckdb(
187-
record_type: RecordType | str | None,
196+
record_type: RecordType | WorkoutType | str | None,
188197
value: str,
189198
date_from: str | None = None,
190199
date_to: str | None = None,
@@ -204,14 +213,14 @@ def search_values_duckdb(
204213
records with the value of "HKCategoryValueSleepAnalysisAsleepDeep"
205214
- The function automatically handles date filtering if date_from/date_to are provided
206215
- Do not guess, autofill, or assume any missing data.
207-
- If there are multiple databases available (DuckDB, ClickHouse, Elasticsearch):
216+
- If there are multiple databases available (DuckDB, Elasticsearch):
208217
first, ask the user which one he wants to use. DO NOT call any tools before
209218
the user specifies his intent.
210219
- If the user decides on an option, only use tools from this database,
211220
do not switch over to another until the user specifies that he wants
212221
to use a different one. You do not have to keep asking whether
213222
the user wants to use the same database that he used before.
214-
- If there is only one database available (DuckDB, ClickHouse, Elasticsearch):
223+
- If there is only one database available (DuckDB, Elasticsearch):
215224
you can use the tools from this database without the user specifying it.
216225
"""
217226
try:

app/schemas/record.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,27 @@
2222
"HKQuantityTypeIdentifierEnvironmentalAudioExposure",
2323
]
2424

25+
WorkoutType = Literal[
26+
"HKWorkoutActivityTypeRunning",
27+
"HKWorkoutActivityTypeWalking",
28+
"HKWorkoutActivityTypeHiking",
29+
"HKWorkoutActivityTypeTraditionalStrengthTraining",
30+
"HKWorkoutActivityTypeCycling",
31+
"HKWorkoutActivityTypeMixedMetabolicCardioTraining",
32+
"HKWorkoutActivityTypeHighIntensityIntervalTraining",
33+
"HKWorkoutActivityTypeHockey",
34+
]
35+
2536
IntervalType = Literal["day", "week", "month", "year"]
2637

2738

2839
class HealthRecordSearchParams(BaseModel):
29-
record_type: RecordType | str | None = None
40+
record_type: RecordType | WorkoutType | str | None = None
3041
source_name: str | None = None
3142
date_from: str | None = None
3243
date_to: str | None = None
44+
min_workout_duration: str | None = None
45+
max_workout_duration: str | None = None
3346
value_min: str | None = None
3447
value_max: str | None = None
3548
limit: int = 10

app/services/duckdb_client.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,11 @@ def __post_init__(self):
2424
else:
2525
self.path = Path(self.path)
2626

27-
if isinstance(self.path, Path) and not self.path.exists():
28-
raise FileNotFoundError(f"Parquet file not found: {self.path}")
29-
3027
@staticmethod
31-
def format_response(response: DuckDBPyRelation) -> list[dict[str, Any]]:
32-
return response.df().to_dict(orient="records")
28+
def format_response(
29+
response: DuckDBPyRelation | list[DuckDBPyRelation],
30+
) -> list[dict[str, Any]]:
31+
if isinstance(response, DuckDBPyRelation):
32+
return response.df().to_dict(orient="records")
33+
records = [record.df().to_dict(orient="records") for record in response]
34+
return sum(records, [])

0 commit comments

Comments
 (0)