/* Extended Module Player * Copyright (C) 2021-2022 Alice Rowan * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ /** * Report bugs to libxmp or to here: https://github.com/AliceLR/megazeuxtests */ #include "arc_unpack.h" #include #include #include /* #define ARC_DEBUG */ /* ARC method 0x08: read maximum code width from stream, but ignore it. */ #define ARC_IGNORE_CODE_IN_STREAM 0x7ffe /* Spark method 0xff: read maximum code width from stream. */ #define ARC_MAX_CODE_IN_STREAM 0x7fff #define ARC_NO_CODE 0xffffffffUL #define ARC_RESET_CODE 256 #define ARC_BUFFER_SIZE 8192 /* Buffer size for multi-stage compression. */ struct arc_code { arc_uint16 prev; arc_uint16 length; arc_uint8 value; }; struct arc_lookup { arc_uint16 value; arc_uint8 length; }; struct arc_huffman_index { arc_int16 value[2]; }; struct arc_data { /* RLE90. */ size_t rle_in; size_t rle_out; int in_rle_code; int last_byte; /* LZW and huffman. */ arc_uint32 codes_buffered[8]; unsigned buffered_pos; unsigned buffered_width; size_t lzw_bits_in; size_t lzw_in; size_t lzw_out; unsigned lzw_eof; unsigned max_code; unsigned first_code; unsigned next_code; unsigned current_width; unsigned init_width; unsigned max_width; unsigned continue_left; unsigned continue_code; arc_uint32 last_code; unsigned kwkwk; unsigned last_first_value; unsigned char *window; struct arc_code *tree; struct arc_lookup *huffman_lookup; struct arc_huffman_index *huffman_tree; unsigned num_huffman; }; static int arc_unpack_init(struct arc_data *arc, int init_width, int max_width, int is_dynamic) { arc->rle_out = 0; arc->rle_in = 0; arc->in_rle_code = 0; arc->last_byte = 0; arc->buffered_pos = 0; arc->buffered_width = 0; arc->lzw_bits_in = 0; arc->lzw_in = 0; arc->lzw_out = 0; arc->lzw_eof = 0; arc->max_code = (1 << max_width); arc->first_code = is_dynamic ? 257 : 256; arc->current_width = init_width; arc->init_width = init_width; arc->max_width = max_width; arc->continue_left = 0; arc->continue_code = 0; arc->last_code = ARC_NO_CODE; arc->last_first_value = 0; arc->kwkwk = 0; arc->window = NULL; arc->tree = NULL; arc->huffman_lookup = NULL; arc->huffman_tree = NULL; arc->num_huffman = 0; if(max_width) { size_t i; if(max_width < 9 || max_width > 16) return -1; arc->tree = (struct arc_code *)calloc(1 << max_width, sizeof(struct arc_code)); if(!arc->tree) return -1; for(i = 0; i < 256; i++) { struct arc_code *c = &(arc->tree[i]); c->prev = (arc_uint16)ARC_NO_CODE; c->length = 1; c->value = i; } arc->next_code = arc->first_code; } return 0; } static int arc_unpack_window(struct arc_data *arc, size_t window_size) { arc->window = (unsigned char *)malloc(window_size); if(!arc->window) return -1; return 0; } static void arc_unpack_free(struct arc_data *arc) { free(arc->window); free(arc->tree); free(arc->huffman_lookup); free(arc->huffman_tree); } static arc_uint32 arc_get_bytes(const unsigned char *pos, int num) { switch(num) { case 0: return 0; case 1: return pos[0]; case 2: return pos[0] | (pos[1] << 8UL); case 3: return pos[0] | (pos[1] << 8UL) | (pos[2] << 16UL); default: return pos[0] | (pos[1] << 8UL) | (pos[2] << 16UL) | (pos[3] << 24UL); } } static arc_int32 arc_read_bits(struct arc_data * ARC_RESTRICT arc, const unsigned char *src, size_t src_len, unsigned int num_bits) { arc_uint32 ret; if(arc->lzw_bits_in + num_bits > (src_len << 3)) { arc->lzw_bits_in = src_len << 3; arc->lzw_in = src_len; return -1; } ret = arc_get_bytes(src + arc->lzw_in, src_len - arc->lzw_in); ret = (ret >> (arc->lzw_bits_in & 7)) & (0xffffUL << num_bits >> 16); arc->lzw_bits_in += num_bits; arc->lzw_in = arc->lzw_bits_in >> 3; return ret; } static arc_uint32 arc_next_code(struct arc_data * ARC_RESTRICT arc, const unsigned char *src, size_t src_len) { /** * Codes are read 8 at a time in the original ARC/ArcFS/Spark software, * presumably to simplify file IO. This buffer needs to be simulated. * * When the code width changes, the extra buffered codes are discarded. * Despite this, the final number of codes won't always be a multiple of 8. */ if(arc->buffered_pos >= 8 || arc->buffered_width != arc->current_width) { size_t i; for(i = 0; i < 8; i++) { arc_int32 value = arc_read_bits(arc, src, src_len, arc->current_width); if(value < 0) break; arc->codes_buffered[i] = value; } for(; i < 8; i++) arc->codes_buffered[i] = ARC_NO_CODE; arc->buffered_pos = 0; arc->buffered_width = arc->current_width; } return arc->codes_buffered[arc->buffered_pos++]; } static void arc_unlzw_add(struct arc_data *arc) { if(arc->last_code != ARC_NO_CODE && arc->next_code < arc->max_code) { arc_uint32 len = arc->tree[arc->last_code].length; struct arc_code *e; e = &(arc->tree[arc->next_code++]); e->prev = arc->last_code; e->length = len ? len + 1 : 0; e->value = arc->last_first_value; /* Automatically expand width. */ if(arc->next_code >= (1U << arc->current_width) && arc->current_width < arc->max_width) { arc->current_width++; #ifdef ARC_DEBUG fprintf(stderr, "width expanded to %u\n", arc->current_width); #endif } } } static int arc_unlzw_get_length(const struct arc_data *arc, const struct arc_code *e) { unsigned length = 1; int code; if(e->length) return e->length; do { if(length >= arc->max_code) return 0; length++; code = e->prev; e = &(arc->tree[code]); } while(code >= 256); return length; } static int arc_unlzw_block(struct arc_data * ARC_RESTRICT arc, unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len) { unsigned char *pos; struct arc_code *e; arc_uint16 start_code; arc_uint32 code; int len; int set_last_first; #ifdef ARC_DEBUG int num_debug = 0; #endif while(arc->lzw_out < dest_len) { /* Interrupted while writing out code? Resume output... */ if(arc->continue_code) { code = arc->continue_code; set_last_first = 0; goto continue_code; } code = arc_next_code(arc, src, src_len); if(code >= arc->max_code) { arc->lzw_eof = 1; break; } #ifdef ARC_DEBUG fprintf(stderr, "%04x ", code); num_debug++; if(!(num_debug & 15)) fprintf(stderr, "\n"); #endif if(code == ARC_RESET_CODE && arc->first_code == 257) { size_t i; /* Reset width for dynamic modes 8, 9, and 255. */ #ifdef ARC_DEBUG fprintf(stderr, "reset at size = %u codes\n", arc->next_code); #endif arc->next_code = arc->first_code; arc->current_width = arc->init_width; arc->last_code = ARC_NO_CODE; for(i = 256; i < arc->max_code; i++) arc->tree[i].length = 0; continue; } /* Add next code first to avoid KwKwK problem. */ if((unsigned)code == arc->next_code) { arc_unlzw_add(arc); arc->kwkwk = 1; } /* Emit code. */ set_last_first = 1; continue_code: start_code = code; e = &(arc->tree[code]); if(!arc->continue_code) { len = arc_unlzw_get_length(arc, e); if(!len) { #ifdef ARC_DEBUG fprintf(stderr, "failed to get length for %04xh (code count is %04xh)\n", code, arc->next_code); #endif return -1; } } else len = arc->continue_left; if((unsigned)len > dest_len - arc->lzw_out) { /* Calculate arc->continue_left, skip arc->continue_left, * emit remaining len from end of dest. */ arc_int32 num_emit = dest_len - arc->lzw_out; arc->continue_left = len - num_emit; arc->continue_code = code; for(; len > num_emit; len--) e = &(arc->tree[e->prev]); } else arc->continue_code = 0; pos = dest + arc->lzw_out + len - 1; arc->lzw_out += len; for(; len > 0; len--) { code = e->value; *(pos--) = code; e = &(arc->tree[e->prev]); } /* Only set this if this is the tail end of the chain, * i.e., the first section written. */ if(set_last_first) arc->last_first_value = code; if(arc->continue_code) return 0; if(!arc->kwkwk) arc_unlzw_add(arc); arc->last_code = start_code; arc->kwkwk = 0; } return 0; } static int arc_unrle90_block(struct arc_data * ARC_RESTRICT arc, unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len) { size_t start; size_t len; size_t i; for(i = 0; i < src_len;) { if(arc->in_rle_code) { arc->in_rle_code = 0; if(i >= src_len) { #ifdef ARC_DEBUG fprintf(stderr, "end of input stream mid-code @ %zu\n", i); #endif return -1; } if(src[i] == 0) { if(arc->rle_out >= dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "end of output stream @ %zu emitting 0x90\n", i); #endif return -1; } #ifdef ARC_DEBUG fprintf(stderr, "@ %zu: literal 0x90\n", i); #endif dest[arc->rle_out++] = 0x90; arc->last_byte = 0x90; } else { len = src[i] - 1; if(arc->rle_out + len > dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "end of output stream @ %zu: run of %02xh times %zu\n", i, arc->last_byte, len); #endif return -1; } #ifdef ARC_DEBUG fprintf(stderr, "@ %zu: run of %02xh times %zu\n", i, arc->last_byte, len); #endif memset(dest + arc->rle_out, arc->last_byte, len); arc->rle_out += len; } i++; } start = i; while(i < src_len && src[i] != 0x90) i++; if(i > start) { len = i - start; if(len + arc->rle_out > dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "end of output_stream @ %zu: block of length %zu\n", i, len); #endif /* In some uncommon cases, ArcFS seems to output extra data beyond the * expected end of the file when unpacking crunched files. In the few * that have CRCs, ignoring the extra data still passes the check. */ len = dest_len - arc->rle_out; if(!len) break; } #ifdef ARC_DEBUG fprintf(stderr, "@ %zu: block of length %zu\n", i, len); #endif memcpy(dest + arc->rle_out, src + start, len); arc->rle_out += len; arc->last_byte = src[i - 1]; } if(i < src_len && src[i] == 0x90) { arc->in_rle_code = 1; i++; } } arc->rle_in += i; return 0; } static int arc_unpack_rle90(unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len) { struct arc_data arc; if(arc_unpack_init(&arc, 0, 0, 0) != 0) return -1; if(arc_unrle90_block(&arc, dest, dest_len, src, src_len) != 0) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unrle90_block failed\n"); #endif goto err; } if(arc.rle_out != dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "out %zu != buffer size %zu\n", arc.rle_out, dest_len); #endif goto err; } arc_unpack_free(&arc); return 0; err: arc_unpack_free(&arc); return -1; } static int arc_unpack_lzw(unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len, int init_width, int max_width) { struct arc_data arc; int is_dynamic = (init_width != max_width); if(max_width == ARC_MAX_CODE_IN_STREAM) { if(src_len < 2) return -1; max_width = src[0]; src++; src_len--; if(max_width < 9 || max_width > 16) return -1; } if(arc_unpack_init(&arc, init_width, max_width, is_dynamic) != 0) return -1; if(arc_unlzw_block(&arc, dest, dest_len, src, src_len)) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unlzw_block failed (%zu in, %zu out)\n", arc.lzw_in, arc.lzw_out); #endif goto err; } if(arc.lzw_out != dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "out %zu != buffer size %zu\n", arc.lzw_out, dest_len); #endif goto err; } arc_unpack_free(&arc); return 0; err: arc_unpack_free(&arc); return -1; } static int arc_unpack_lzw_rle90(unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len, int init_width, int max_width) { struct arc_data arc; int is_dynamic = (init_width != max_width); /* This is only used for Spark method 0xff, which doesn't use RLE. */ if(max_width == ARC_MAX_CODE_IN_STREAM) return -1; if(max_width == ARC_IGNORE_CODE_IN_STREAM) { if(src_len < 2) return -1; src++; src_len--; max_width = 12; } if(max_width < 9 || max_width > 16) return -1; if(arc_unpack_init(&arc, init_width, max_width, is_dynamic) != 0) return -1; if(arc_unpack_window(&arc, ARC_BUFFER_SIZE) != 0) goto err; while(arc.lzw_eof == 0) { arc.lzw_out = 0; if(arc_unlzw_block(&arc, arc.window, ARC_BUFFER_SIZE, src, src_len)) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unlzw_block failed " "(%zu in, %zu out in buffer, %zu out in stream)\n", arc.lzw_in, arc.lzw_out, arc.rle_out); #endif goto err; } if(arc_unrle90_block(&arc, dest, dest_len, arc.window, arc.lzw_out)) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unrle90_block failed (%zu in, %zu out)\n", arc.lzw_in, arc.rle_out); #endif goto err; } } if(arc.rle_out != dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "out %zu != buffer size %zu\n", arc.rle_out, dest_len); #endif goto err; } arc_unpack_free(&arc); return 0; err: arc_unpack_free(&arc); return -1; } /** * Huffman decoding based on this blog post by Phaeron. * https://www.virtualdub.org/blog2/entry_345.html */ #define LOOKUP_BITS 11 #define LOOKUP_MASK ((1 << LOOKUP_BITS) - 1) #define HUFFMAN_TREE_MAX 256 static int arc_huffman_check_tree(const struct arc_huffman_index *tree) { /* Make sure the tree isn't garbage... */ const struct arc_huffman_index *e; arc_uint8 visited[HUFFMAN_TREE_MAX]; arc_uint8 stack[HUFFMAN_TREE_MAX]; int stack_pos = 1; size_t i; memset(visited, 0, sizeof(visited)); stack[0] = 0; while(stack_pos > 0) { i = stack[--stack_pos]; e = &(tree[i]); visited[i] = 1; if(e->value[0] >= 0) { if(visited[e->value[0]]) return -1; stack[stack_pos++] = e->value[0]; } if(e->value[1] >= 0) { if(visited[e->value[1]]) return -1; stack[stack_pos++] = e->value[1]; } } return 0; } static int arc_huffman_init(struct arc_data * ARC_RESTRICT arc, const unsigned char *src, size_t src_len) { size_t table_size = 1 << LOOKUP_BITS; size_t iter; size_t i; size_t j; if(src_len < 2) return -1; arc->num_huffman = src[0] | (src[1] << 8); if(!arc->num_huffman || arc->num_huffman > HUFFMAN_TREE_MAX) return -1; arc->lzw_in = 2UL + 4UL * arc->num_huffman; arc->lzw_bits_in = (arc->lzw_in << 3); if(arc->lzw_in > src_len) return -1; /* Precompute huffman tree and lookup table. */ arc->huffman_lookup = (struct arc_lookup *)calloc(table_size, sizeof(struct arc_lookup)); arc->huffman_tree = (struct arc_huffman_index *)malloc(arc->num_huffman * sizeof(struct arc_huffman_index)); for(i = 0; i < arc->num_huffman; i++) { struct arc_huffman_index *e = &(arc->huffman_tree[i]); e->value[0] = src[i * 4 + 2] | (src[i * 4 + 3] << 8); e->value[1] = src[i * 4 + 4] | (src[i * 4 + 5] << 8); if(e->value[0] >= (int)arc->num_huffman || e->value[1] >= (int)arc->num_huffman) return -1; } if(arc_huffman_check_tree(arc->huffman_tree) < 0) return -1; for(i = 0; i < table_size; i++) { int index = 0; int value = i; int bits; if(arc->huffman_lookup[i].length) continue; for(bits = 0; index >= 0 && bits < LOOKUP_BITS; bits++) { index = arc->huffman_tree[index].value[value & 1]; value >>= 1; } if(index >= 0) { arc->huffman_lookup[i].value = index; continue; } iter = 1 << bits; for(j = i; j < table_size; j += iter) { arc->huffman_lookup[j].value = ~index; arc->huffman_lookup[j].length = bits; } } return 0; } static int arc_huffman_read_bits(struct arc_data * ARC_RESTRICT arc, const unsigned char *src, size_t src_len) { struct arc_huffman_index *tree = arc->huffman_tree; struct arc_lookup *e; size_t peek; size_t bits_end; int index; if(arc->lzw_in >= src_len) return -1; /* Optimize short values with precomputed table. */ peek = arc_get_bytes(src + arc->lzw_in, src_len - arc->lzw_in) >> (arc->lzw_bits_in & 7); e = &(arc->huffman_lookup[peek & LOOKUP_MASK]); if(e->length) { arc->lzw_bits_in += e->length; arc->lzw_in = arc->lzw_bits_in >> 3; return e->value; } /* The table also allows skipping the first few bits of long codes. */ bits_end = (src_len << 3); arc->lzw_bits_in += LOOKUP_BITS; index = e->value; while(index >= 0 && arc->lzw_bits_in < bits_end) { /* Force unsigned here to avoid potential sign extensions. */ unsigned bit = (unsigned)src[arc->lzw_bits_in >> 3] >> (arc->lzw_bits_in & 7); arc->lzw_bits_in++; index = tree[index].value[bit & 1]; } arc->lzw_in = arc->lzw_bits_in >> 3; /* This translates truncated code indices to negative * values (i.e. failure), no check required. */ return ~index; } static int arc_unhuffman_block(struct arc_data * ARC_RESTRICT arc, unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len) { while(arc->lzw_out < dest_len) { int value = arc_huffman_read_bits(arc, src, src_len); if(value >= 256) { /* End of stream code. */ arc->lzw_in = src_len; arc->lzw_eof = 1; return 0; } if(value < 0) return -1; dest[arc->lzw_out++] = value; } return 0; } static int arc_unpack_huffman_rle90(unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len) { struct arc_data arc; if(arc_unpack_init(&arc, 0, 0, 0) != 0) return -1; if(arc_unpack_window(&arc, ARC_BUFFER_SIZE) != 0) goto err; if(arc_huffman_init(&arc, src, src_len) != 0) goto err; while(arc.lzw_eof == 0) { arc.lzw_out = 0; if(arc_unhuffman_block(&arc, arc.window, ARC_BUFFER_SIZE, src, src_len)) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unhuffman_block failed " "(%zu in, %zu out in buffer, %zu out in stream)\n", arc.lzw_in, arc.lzw_out, arc.rle_out); #endif goto err; } if(arc_unrle90_block(&arc, dest, dest_len, arc.window, arc.lzw_out)) { #ifdef ARC_DEBUG fprintf(stderr, "arc_unrle90_block failed (%zu in, %zu out)\n", arc.lzw_in, arc.rle_out); #endif goto err; } } if(arc.rle_out != dest_len) { #ifdef ARC_DEBUG fprintf(stderr, "out %zu != buffer size %zu\n", arc.rle_out, dest_len); #endif goto err; } arc_unpack_free(&arc); return 0; err: arc_unpack_free(&arc); return -1; } const char *arc_unpack(unsigned char * ARC_RESTRICT dest, size_t dest_len, const unsigned char *src, size_t src_len, int method, int max_width) { switch(method & 0x7f) { case ARC_M_UNPACKED_OLD: case ARC_M_UNPACKED: /* Handle these somewhere that doesn't require an extra buffer. */ return "not packed"; case ARC_M_PACKED: /* RLE90 */ if(arc_unpack_rle90(dest, dest_len, src, src_len) < 0) return "failed unpack"; break; case ARC_M_SQUEEZED: /* RLE90 + Huffman coding */ if(arc_unpack_huffman_rle90(dest, dest_len, src, src_len) < 0) return "failed unsqueeze"; break; case ARC_M_CRUNCHED: /* RLE90 + LZW 9-12 bit dynamic */ if(max_width > 16) return "invalid uncrunch width"; if(max_width <= 0) max_width = ARC_IGNORE_CODE_IN_STREAM; if(arc_unpack_lzw_rle90(dest, dest_len, src, src_len, 9, max_width)) return "failed uncrunch"; break; case ARC_M_SQUASHED: /* LZW 9-13 bit dynamic */ if(arc_unpack_lzw(dest, dest_len, src, src_len, 9, 13)) return "failed unsquash"; break; case ARC_M_COMPRESSED: /* LZW 9-16 bit dynamic */ if(max_width > 16) return "invalid uncompress width"; if(max_width <= 0) max_width = ARC_MAX_CODE_IN_STREAM; if(arc_unpack_lzw(dest, dest_len, src, src_len, 9, max_width)) return "failed uncompress"; break; default: return "unsupported method"; } return NULL; }