vlib: add archive.tar module to enable reading of .tar and .tar.gz files (#24995)

This commit is contained in:
Jorge Mireles 2025-07-30 10:11:41 -06:00 committed by GitHub
parent b876644e82
commit a8d75c10b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 969 additions and 0 deletions

3
vlib/archive/README.md Normal file
View file

@ -0,0 +1,3 @@
## Description
`archive` is a namespace for different archive formats like `tar` or `zip`.

View file

@ -0,0 +1,33 @@
## Description
`tar` is a module to access tar archives.
Tape archives (tar) are a file format for storing a sequence of files that can be read and written
as streams. This module covers the reading of the basic sections of archives produced by GNU tools
like the Linux command `tar -xvf`, but in memory instead of modifying the filesystem. It parses
directories, files, and file contents, and manages paths longer than 100 chars.
### Read Efficiency
An entire tar file can be read in memory or by chunks. It keeps in memory a single decompressed
[chunk](https://modules.vlang.io/compress.gzip.html#decompress_with_callback) of 32 KB at a time
and also a single tar block of 512 bytes at a time. Paths are converted to strings only when
needed, and the user's reader implementation can stop the reading process early.
### Read Example
The tar blocks are parsed and some fields are passed to `Reader` implemented methods.
```v
import os
import archive.tar
fn main() {
os.chdir(@VMODROOT) or {}
path := 'archive/tar/testdata/life.tar.gz'
reader := tar.new_debug_reader()
tar.read_tar_gz_file(path, reader)!
}
```
Look also in `examples` folder the `tar_gz_reader.v` program.

277
vlib/archive/tar/reader.v Normal file
View file

@ -0,0 +1,277 @@
module tar
import compress.gzip
import os
// read_tar_gz_file decompresses a given local .tar.gz file at `path` and
// feeds every 512-byte tar block to the given `reader` implementation.
pub fn read_tar_gz_file(path string, reader Reader) ! {
	compressed := os.read_bytes(path)!
	decompressed := gzip.decompress(compressed)!
	mut untar := Untar{
		reader: reader
	}
	untar.read_all_blocks(decompressed)!
}
// Read is used by Untar to call Reader implemented methods.
// The implementor can read the block's `get_block_number()` and `get_path()`
// and can set the field `stop_early` to true to suspend the reading.
pub struct Read {
mut:
	block_number int // consecutive 1-based counter of parsed tar blocks
	special      BlockSpecial
	prefix_len   int // used length of prefix_buf
	prefix_buf   [131]u8 // raw bytes of the header `prefix` field (offset 345)
	separator    bool // whether a `/` is inserted between prefix and path
	path_len     int // used length of path_buf
	path_buf     [100]u8 // raw bytes of the header `name` field (offset 0)
	long_path    &LongPath = unsafe { nil } // non-nil when a GNU `L` long name applies
pub mut:
	stop_early bool // set by the Reader implementor to suspend reading
}
// set_short_path sets Read path with the tar block strings `prefix` and `path`.
// Block's `prefix` C string max length is 131 but most of the time it is 0.
// Block's `path` C string max length is 100. Both `prefix` and `path` are
// kept as raw bytes and converted to a V string only when needed, see `get_path()`.
fn (mut b Read) set_short_path(buffer [512]u8, separator_after_prefix bool) {
	// first check if the TAR block has a prefix string (0 to 131 chars). The
	// prefix is other than '' when the TAR block filepath len is > 100.
	b.prefix_len = 0
	for i := 345; i < 345 + 131; i++ {
		letter := buffer[i]
		if letter == 0 {
			break // first 0 found means prefix C string is complete.
		}
		b.prefix_buf[b.prefix_len] = letter
		b.prefix_len++
	}
	b.separator = separator_after_prefix
	// most of the time there is a path for blocks like dirs and regular files:
	b.path_len = 0
	for i := 0; i < 100; i++ {
		letter := buffer[i]
		if letter == 0 {
			break // first 0 found means path C string is complete.
		}
		b.path_buf[b.path_len] = letter
		b.path_len++
	}
}
// set_long_path points this Read at a previously collected GNU `L` long path,
// which then takes precedence over the short prefix/path in `get_path()`.
fn (mut b Read) set_long_path(long_path &LongPath) {
	b.long_path = unsafe { long_path }
}
// get_path returns the path of this read. The path is valid for blocks of types
// directory, file and file data.
pub fn (b Read) get_path() string {
	// a registered long path overrides the short prefix/path pair
	if b.long_path != unsafe { nil } {
		return b.long_path.get_path()
	}
	mut joined := []u8{cap: b.prefix_len + 1 + b.path_len}
	if b.prefix_len > 0 {
		joined << b.prefix_buf[0..b.prefix_len]
		if b.separator {
			joined << `/`
		}
	}
	if b.path_len > 0 {
		joined << b.path_buf[0..b.path_len]
	}
	return joined.bytestr()
}
// get_block_number returns the consecutive (1-based) number of this read.
pub fn (b Read) get_block_number() int {
	return b.block_number
}

// get_special returns the special type of the Read.
pub fn (b Read) get_special() BlockSpecial {
	return b.special
}

// str returns a string representation with block number, path, special type and stop early.
pub fn (r Read) str() string {
	return '(block_number:${r.block_number} path:${r.get_path()} special:${r.special} stop_early:${r.stop_early})'
}
// Reader is used by Untar to report the parsed blocks.
pub interface Reader {
mut:
	// dir_block is called when untar reads a block of type directory.
	// Call `Read.get_path()` to get the full name of the directory.
	// `size` field is zero for directories.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	dir_block(mut read Read, size u64)
	// file_block is called when untar reads a block of type filename.
	// Call `Read.get_path()` to get the full name of the file.
	// `size` is the expected file size in bytes to be read later.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	file_block(mut read Read, size u64)
	// data_block is called when untar reads a block of type filedata.
	// Call `Read.get_path()` to get the full name of the file data belongs to.
	// The `data` size is 512 bytes or less. `pending` indicates how many bytes are left to read.
	// The implementor can inspect the data and use the pending value
	// to set Read's field `stop_early` to suspend the reader.
	data_block(mut read Read, data []u8, pending int)
	// other_block is called when untar reads a block type other than directory,
	// filename or filedata, e.g. a `block device` or `FIFO`.
	// `Read.get_special()` and `details` give more info about the block.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	other_block(mut read Read, details string)
}
// DebugReader implements a Reader and prints rows for blocks read
// as directories, files, file data blocks and special blocks.
pub struct DebugReader implements Reader {
}

// new_debug_reader returns a DebugReader
pub fn new_debug_reader() &DebugReader {
	return &DebugReader{}
}

// dir_block prints one row per directory block.
fn (mut t DebugReader) dir_block(mut read Read, size u64) {
	println('DIR #${read.get_block_number()} ${read.get_path()}')
}

// file_block prints one row per file header block.
fn (mut t DebugReader) file_block(mut read Read, size u64) {
	println('FILE #${read.get_block_number()} path:${read.get_path()} size:${size}')
}

// data_block prints one row per file content block.
fn (mut t DebugReader) data_block(mut read Read, data []u8, pending int) {
	println('DATA #${read.get_block_number()} ${read.get_path()} size:${data.len} pending:${pending}')
}

// other_block prints one row per special block.
fn (mut t DebugReader) other_block(mut read Read, details string) {
	println('OTHER #${read.get_block_number()} special:${read.special} ${details}')
}
// ReadResult is returned by ReadResultFn
pub enum ReadResult {
	@continue   // keep reading the next block
	stop_early  // the Reader implementor asked to suspend
	end_of_file // all supplied blocks were consumed
	end_archive // two consecutive blank blocks mark the archive end
	overflow    // internal buffer overflow (should not occur)
}

// ReadResultFn dispatches a single 512-byte block and reports how to proceed.
type ReadResultFn = fn (block []u8) !ReadResult
@[heap]
pub struct Decompressor {
mut:
	untar &Untar // receives the decompressed 512-byte blocks
}

// new_decompresor returns a Decompressor to decompress a tar.gz file
// A given Untar with a registered Reader will read the blocks.
// NOTE(review): the public name misses an `s` ("decompresor"); kept as-is
// because renaming would break callers.
pub fn new_decompresor(untar &Untar) &Decompressor {
	return &Decompressor{
		untar: untar
	}
}
// read_all decompresses the given `tar_gz` array with all the tar blocks.
// Then calls the untar method `read_all_blocks` to read every block at once.
// A read result is returned, which can be of the type stop early, or an error.
pub fn (mut d Decompressor) read_all(tar_gz []u8) !ReadResult {
	blocks := gzip.decompress(tar_gz)!
	result := d.untar.read_all_blocks(blocks)!
	return result
}
// read_chunks decompresses the given `tar_gz` array by chunks of
// 32768 bytes which can hold up to 64 tar blocks of 512 bytes each.
// Then calls untar method read_block with ChunksReader dispatcher.
// A read result is returned which can be of the type stop early or an error.
pub fn (mut d Decompressor) read_chunks(tar_gz []u8) !ReadResult {
	mut reader := &ChunksReader{
		read_block_fn: d.untar.read_single_block
	}
	// the gzip callback returns the consumed length to keep decompressing,
	// or 0 to make the decompression stop.
	callback := fn (chunk []u8, mut reader ChunksReader) int {
		result := reader.read_blocks(chunk)
		if result == .continue {
			return chunk.len // go for more
		}
		return 0 // suspend
	}
	gzip.decompress_with_callback(tar_gz, callback, reader) or {
		// a deliberately suspended decompression surfaces here as a gzip
		// error; only propagate it when this reader did not ask to stop
		if reader.result == .continue {
			return err
		}
		return reader.result
	}
	return reader.result
}
// ChunksReader has a reusable fixed buffer with maximum length of decompressed chunk
// of 32768 bytes plus a maximum previous pending tar block of 512 bytes.
struct ChunksReader {
mut:
	read_block_fn  ReadResultFn = unsafe { nil } // dispatches one 512-byte block
	buffer         [32768 + 512]u8 // leftover bytes + newest chunk
	chunks_counter int // number of chunks received so far
	pending        int // position of the last not sent buffer byte
	result         ReadResult // last result from read_block_fn
}
// read_blocks receives a chunk like those of 32k from a gzip decompressor. The chunk is
// assumed to be a TAR archive section and is cut in 512 bytes blocks that are sent to
// the untar reader one by one. The untar reader result informs this process to continue or
// stop early. This process can keep in the buffer the remaining bytes of an incomplete
// block, which will be sent to the untar reader prepended to the next chunk's cuts.
fn (mut d ChunksReader) read_blocks(chunk []u8) ReadResult {
	d.chunks_counter++
	total := d.pending + chunk.len
	if total > d.buffer.len {
		assert false, 'Should not occur buffer overflow ${total}'
		return .overflow
	}
	// append new chunk after previous incomplete block bytes not sent yet
	for i, ch in chunk {
		d.buffer[i + d.pending] = ch
	}
	d.pending += chunk.len
	mut cut := 0
	for {
		if cut + 512 > d.pending {
			// after sending all complete blocks move the remaining not sent bytes
			// to the start of the reused buffer to be prepended before next chunk.
			// fix: target index is `i - cut` — the previous `cut - 512` wrote a
			// constant index, out of bounds whenever cut < 512.
			for i := cut; i < d.pending; i++ {
				d.buffer[i - cut] = d.buffer[i]
			}
			d.pending -= cut
			return .continue
		}
		// send a complete block
		block := d.buffer[cut..cut + 512]
		cut += 512
		d.result = d.read_block_fn(block) or {
			// a failure from the untar dispatcher, not a buffer overflow
			assert false, 'read_block_fn failed: ${err}'
			return .overflow
		}
		match d.result {
			.continue {
				// try next cut or leave a remaining
			}
			else {
				break // untar error or stop_early
			}
		}
	}
	return d.result
}

View file

@ -0,0 +1,157 @@
module tar
import os
// testsuite_begin runs before the tests: cd to the V module root so the
// relative `testdata` paths below resolve.
fn testsuite_begin() {
	os.chdir(@VMODROOT) or {}
}
const testdata = 'vlib/archive/tar/testdata'
// test_golang_testdata parses fixture archives copied from golang:
// https://github.com/golang/go/blob/master/src/archive/tar/testdata/file-and-dir.tar
fn test_golang_testdata() {
	// [ ] dir | 0 bytes | folder
	// [ ] small.txt | 5 bytes | file
	r1 := new_test_reader('file-and-dir.tar', false)!
	assert r1.dirs[0] == 'dir/'
	assert r1.files['small.txt'] == 5
	assert r1.data['small.txt'] == 'Kilts'.bytes()
	assert r1.other[0] == 'block:4 special:blank_1 continue'
	assert r1.other[1] == 'block:5 special:blank_2 end_archive'
	// [ ] small.txt | 5 bytes | file
	// [ ] small2.txt | 11 bytes | file
	r2 := new_test_reader('gnu.tar', false)!
	assert r2.dirs.len == 0
	assert r2.files['small.txt'] == 5
	assert r2.files['small2.txt'] == 11
	assert r2.data['small.txt'] == 'Kilts'.bytes()
	assert r2.data['small2.txt'] == 'Google.com\n'.bytes()
	// a filename with non-UTF-8 bytes:
	// [ ] h1<?><?><?><?>bye | 0 bytes
	r3 := new_test_reader('gnu-not-utf8.tar', false)!
	r3_filename := [u8(`h`), `i`, 0x80, 0x81, 0x82, 0x83, `b`, `y`, `e`].bytestr()
	r3_file_len := r3.files[r3_filename] or { assert false, 'file not found: ${r3_filename}' }
	assert r3_file_len == 0
	assert r3.other.len == 2
	// a GNU long name terminated by NUL:
	// [ ] 0123456789 | 0 bytes
	r4 := new_test_reader('gnu-long-nul.tar', false)!
	assert r4.dirs.len == 0
	r4_filename := '0123456789'
	r4_file_len := r4.files[r4_filename] or {
		assert false, 'file ${r4_filename} not found in ${r4.files.keys()}'
	}
	assert r4_file_len == 0
	assert r4.other[0] == 'block:1 special:long_name size:161'
	assert r4.other[1] == 'block:2 special:long_name data_part:161'
	// a long UTF-8 filename:
	// [ ] ☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹ | 0 bytes
	r5 := new_test_reader('gnu-utf8.tar', false)!
	r5_filename := ''
	r5_file_len := r5.files[r5_filename] or { assert false, 'file not found: ${r5_filename}' }
	assert r5_file_len == 0
	assert r5.other[0] == 'block:1 special:long_name size:163'
	assert r5.other[1] == 'block:2 special:long_name data_part:163'
}
// test_long_long_short checks that a long path (human) can be substituted by
// another long path (chimp) and then by a normal path (cat).
fn test_long_long_short() {
	r1 := new_test_reader_gz('life.tar.gz', false)!
	mammal := 'life/Animalia/Chordata/Mammalia'
	human := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Homo/Homo sapiens.txt'
	chimp := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Pan/Pan troglodytes.txt'
	cat := '${mammal}/Carnivora_Feliformia/Felidae_Felinae/Felis/Felis catus.txt'
	// the first two exercise the GNU long-name path, the third the short path
	assert human.len > 100
	assert chimp.len > 100
	assert cat.len <= 100
	assert r1.files[human] == 35
	assert r1.files[chimp] == 40
	assert r1.files[cat] == 33
	assert r1.texts[human] == 'https://en.wikipedia.org/wiki/Human'
	assert r1.texts[chimp] == 'https://en.wikipedia.org/wiki/Chimpanzee'
	assert r1.texts[cat] == 'https://en.wikipedia.org/wiki/Cat'
}
// TestReader implements Reader and records everything it is told,
// so the tests can assert on the collected state afterwards.
struct TestReader {
	debug bool // print rows while reading
mut:
	dirs      []string // paths of directory blocks, in order
	files     map[string]u64 // file path -> announced size
	data      map[string][]u8 // file path -> collected content bytes
	texts     map[string]string // file path -> content as string
	other     []string // formatted rows for special blocks
	last_file string // file currently accumulating data
	last_data []u8 // accumulated data of last_file
}
// new_test_reader reads files *.tar from testdata and returns the
// TestReader holding the collected blocks.
fn new_test_reader(tar_file string, debug bool) !&TestReader {
	mut reader := &TestReader{
		debug: debug
	}
	mut untar := Untar{
		reader: reader
	}
	all_blocks := os.read_bytes('${testdata}/${tar_file}')!
	untar.read_all_blocks(all_blocks)!
	return reader
}
// new_test_reader_gz reads files *.tar.gz from testdata through a
// Decompressor and returns the TestReader holding the collected blocks.
fn new_test_reader_gz(tar_gz_file string, debug bool) !&TestReader {
	mut reader := &TestReader{
		debug: debug
	}
	mut untar := Untar{
		reader: reader
	}
	mut decompressor := new_decompresor(untar)
	tar_gz := os.read_bytes('${testdata}/${tar_gz_file}')!
	decompressor.read_all(tar_gz)!
	return reader
}
// dir_block records the directory path.
fn (mut t TestReader) dir_block(mut read Read, size u64) {
	t.dirs << read.get_path()
	if t.debug {
		println('DIR #${read.get_block_number()} ${read.get_path()}')
	}
}

// file_block records the announced file size and remembers the path so
// subsequent data blocks are attributed to it.
fn (mut t TestReader) file_block(mut read Read, size u64) {
	t.last_file = read.get_path()
	t.files[t.last_file] = size
	if t.debug {
		println('FILE #${read.get_block_number()} ${read.get_path()}')
	}
}

// data_block accumulates content for the current file; when no bytes are
// pending, the complete content is stored under the file's path.
fn (mut t TestReader) data_block(mut read Read, data []u8, pending int) {
	path := read.get_path()
	if t.debug {
		println('DATA #${read.get_block_number()} ${path}')
	}
	if t.last_file == path {
		t.last_data << data
		if pending == 0 {
			t.data[t.last_file] = t.last_data.clone()
			t.texts[path] = t.last_data.bytestr()
			if t.debug {
				println('TEXT #${read.get_block_number()} ${t.last_data.bytestr()}')
			}
			t.last_file = ''
			t.last_data.clear()
		}
	}
}

// other_block records a formatted row for every special block.
fn (mut t TestReader) other_block(mut read Read, details string) {
	t.other << 'block:${read.block_number} special:${read.special} ${details}'
	if t.debug {
		println('OTHER #${read.get_block_number()} special:${read.special} ${details}')
	}
}

43
vlib/archive/tar/tar.v Normal file
View file

@ -0,0 +1,43 @@
module tar
// ustar header block octets
// Field | Offset | Length
// --------------------------
// name | 0 | 100
// mode | 100 | 8
// uid | 108 | 8
// gid | 116 | 8
// size | 124 | 12
// mtime | 136 | 12
// chksum | 148 | 8
// typeflag | 156 | 1
// linkname | 157 | 100
// magic | 257 | 6
// version | 263 | 2
// uname | 265 | 32
// gname | 297 | 32
// devmajor | 329 | 8
// devminor | 337 | 8
// prefix | 345 | 155
// BlockHeader is the `typeflag` byte at offset 156 of a ustar header block.
pub enum BlockHeader as u8 {
	file      = u8(`0`) // 0x30
	hard_link = u8(`1`) // 0x31
	sym_link  = u8(`2`) // 0x32
	char_dev  = u8(`3`) // 0x33
	block_dev = u8(`4`) // 0x34
	dir       = u8(`5`) // 0x35
	fifo      = u8(`6`) // 0x36
	long_name = u8(`L`) // 0x4c = 76 dec, GNU long file name
	global    = u8(`g`) // 0x67 pax
}
// BlockSpecial classifies a parsed block for the Reader.
pub enum BlockSpecial {
	no // for headers `0`,`5` or data blocks
	blank_1 // first blank block: continue
	blank_2 // second blank block: end of archive
	ignore // for headers `1`, `2`, `3`, `4`, `6`
	long_name // for header `L`
	global // for header `g`
	unknown // for a header value not defined above
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
vlib/archive/tar/testdata/gnu-utf8.tar vendored Normal file

Binary file not shown.

BIN
vlib/archive/tar/testdata/gnu.tar vendored Normal file

Binary file not shown.

BIN
vlib/archive/tar/testdata/life.tar.gz vendored Normal file

Binary file not shown.

290
vlib/archive/tar/untar.v Normal file
View file

@ -0,0 +1,290 @@
module tar
// Untar uses a reader to parse the contents of a unix tar file.
// Reuses a fixed array of 512 bytes to parse each TAR block.
@[heap]
pub struct Untar {
mut:
	reader      Reader
	max_blocks  int // total block count; known only via read_all_blocks
	buffer      [512]u8 // data to parse block
	read        Read // last read to send/receive to/from reader implementation
	state       State // what the next block is expected to contain
	size        int // remaining data size during state_data
	long_path   &LongPath = unsafe { nil } // not nil to hold a file long_name
	blank_block int = -1 // last no-data block with all-zeros
}
// State tracks what the parser expects in the next 512-byte block.
enum State {
	header // a ustar header block
	data // file content blocks
	long_path // GNU `L` long-name content blocks
}
// new_untar builds an Untar with a given Reader.
pub fn new_untar(reader Reader) &Untar {
	return &Untar{
		reader: reader
	}
}
// str returns a string representation with max_blocks and the last read.
pub fn (u Untar) str() string {
	return 'max_blocks:${u.max_blocks} last_read:${u.read}'
}
// read_all_blocks parses the data blocks of any decompressed *.tar.gz array.
// The data blocks length must be divisible by 512.
pub fn (mut u Untar) read_all_blocks(blocks []u8) !ReadResult {
	if blocks.len % 512 != 0 {
		return error('data_blocks size is not a multiple of 512')
	}
	u.max_blocks = blocks.len / 512
	mut offset := 0
	for offset < blocks.len {
		result := u.read_single_block(blocks[offset..offset + 512])!
		if result != .continue {
			return result
		}
		offset += 512
	}
	return .end_of_file
}
// read_single_block parses one data block at a time.
// The data block length must be 512. Two consecutive all-zero (blank)
// blocks mark the end of the archive and return a .end_archive result.
pub fn (mut u Untar) read_single_block(block []u8) !ReadResult {
	if block.len != 512 {
		return error('data_block size is not 512')
	}
	u.read.block_number++ // 1,2,3...
	// copy into the reusable buffer and detect an all-zero block on the way
	mut is_blank_block := true
	for i in 0 .. 512 {
		u.buffer[i] = block[i]
		if block[i] != 0 {
			is_blank_block = false
		}
	}
	match u.state {
		.header {
			if is_blank_block {
				// current non-data block is a blank block
				prev_block := u.read.block_number - 1
				result := if u.blank_block == prev_block {
					// two consecutive blank blocks
					u.read.special = .blank_2
					ReadResult.end_archive
				} else {
					// first blank block
					u.read.special = .blank_1
					ReadResult.continue
				}
				u.read.path_len = 0
				u.reader.other_block(mut u.read, '${result}')
				u.blank_block = u.read.block_number
				return result
			}
			u.read_header()!
		}
		.data {
			u.read_data()
		}
		.long_path {
			u.read_long_path()
		}
	}
	return if u.read.stop_early {
		.stop_early
	} else {
		.continue
	}
}
// read_header parses a 512-byte ustar header block, dispatches it to the
// Reader by type, and sets the next parser state (data/long_path) when the
// block announces content.
fn (mut u Untar) read_header() ! {
	u.size = int(u.extract_octal(124, 12)) // `size` field: offset 124, 12 octal digits
	header := u.buffer[156] // pos 0x9c, the `typeflag` byte
	block_header := BlockHeader.from(header) or {
		// unknown typeflag: report it and keep scanning headers
		u.read.special = .unknown
		u.read.path_len = 0
		u.reader.other_block(mut u.read, 'size:${u.size}')
		return
	}
	match block_header {
		.dir {
			if !u.checksum_ok() {
				return error('Checksum error: directory reading:${u.read}')
			}
			u.read.special = .no
			u.read.set_short_path(u.buffer, false)
			u.reader.dir_block(mut u.read, u64(u.size))
			// u.state = .header
		}
		.file {
			if !u.checksum_ok() {
				return error('Checksum error file reading:${u.read}')
			}
			u.read.special = .no
			if u.long_path != unsafe { nil } {
				// a preceding `L` block supplied this file's long name
				u.read.set_long_path(u.long_path)
				if u.size > 0 {
					u.state = .data
				}
			} else {
				u.read.set_short_path(u.buffer, true)
				if u.size > 0 {
					u.state = .data
				}
			}
			u.reader.file_block(mut u.read, u64(u.size))
		}
		.long_name {
			// GNU `L` block: the following data blocks carry the long file name
			u.read.special = .long_name
			u.reader.other_block(mut u.read, 'size:${u.size}')
			if u.size > 0 {
				u.state = .long_path
				u.long_path = new_long_path(u.size)
			}
		}
		.hard_link, .sym_link, .char_dev, .block_dev, .fifo {
			// links and device nodes are reported but not materialized
			u.read.special = .ignore
			u.reader.other_block(mut u.read, block_header.str())
		}
		.global {
			// pax global header: its content is consumed as plain data
			u.read.special = .global
			u.read.set_short_path(u.buffer, false)
			u.reader.other_block(mut u.read, 'size:${u.size}')
			if u.size > 0 {
				u.state = .data
			}
		}
	}
}
// read_data calls Reader.data_block for the implementor to collect data parts as file content.
fn (mut u Untar) read_data() {
	if u.size > 0 {
		part := if u.size > 512 { 512 } else { u.size }
		u.size -= 512
		pending := if u.size > 0 { u.size } else { 0 }
		data_part := u.buffer[0..part]
		u.reader.data_block(mut u.read, data_part, pending)
	}
	if u.size <= 0 {
		// content finished: drop any long name and go back to header parsing
		u.long_path = unsafe { nil }
		u.read.long_path = unsafe { nil } // real clear
		u.state = .header
	}
}
// read_long_path collects GNU `L` content blocks into the pending LongPath
// until `size` bytes were consumed, then returns to header parsing.
fn (mut u Untar) read_long_path() {
	if u.size > 0 {
		part := if u.size > 512 { 512 } else { u.size }
		u.size -= 512
		data_part := u.buffer[0..part]
		if u.long_path != unsafe { nil } {
			// this long path field collects the data parts as file long name
			u.long_path.append(data_part)
			u.reader.other_block(mut u.read, 'data_part:${data_part.len}')
		}
	}
	if u.size <= 0 {
		u.state = .header
	}
}
// extract_path returns the block path for directories and files,
// reading the NUL-terminated `name` C string from the buffer start.
fn (mut u Untar) extract_path() string {
	mut bytes := []u8{}
	for letter in u.buffer {
		if letter == 0 {
			break
		}
		bytes << letter
	}
	return bytes.bytestr()
}
// checksum_ok verifies the validity of dir and file blocks. The eight
// checksum bytes themselves (offsets 148..155) are counted as ASCII
// spaces (0x20), then the sum is compared with the stored octal value.
fn (mut u Untar) checksum_ok() bool {
	mut total := u64(0)
	for n := 0; n < 512; n++ {
		if n >= 148 && n <= 155 {
			total += 0x20
		} else {
			total += u.buffer[n]
		}
	}
	return total == u.extract_octal(148, 8)
}
// extract_octal reads an octal number at block position `pos` with a given number of `digits`.
fn (mut u Untar) extract_octal(pos int, digits int) u64 {
	mut value := u64(0)
	mut p := pos
	mut left := digits
	// skip leading non-octal bytes (padding, spaces, NULs)
	for left > 0 && (u.buffer[p] < `0` || u.buffer[p] > `7`) {
		p++
		left--
	}
	// accumulate consecutive octal digits
	for left > 0 && u.buffer[p] >= `0` && u.buffer[p] <= `7` {
		value = value * 8 + u64(u.buffer[p] - `0`)
		p++
		left--
	}
	return value
}
@[heap]
struct LongPath {
mut:
	name     []u8 // fixed-size buffer for the long name bytes
	last_pos int // next write position in `name`
}

// new_long_path builds a LongPath with a fixed maximum name size
fn new_long_path(size int) &LongPath {
	return &LongPath{
		name: []u8{len: size}
	}
}
// append copies `data` into the fixed-size name buffer at the current write
// position; data that would overflow the preallocated size is ignored.
fn (mut l LongPath) append(data []u8) {
	if l.name.len >= l.last_pos + data.len {
		for i, d in data {
			l.name[l.last_pos + i] = d
		}
		l.last_pos += data.len
	}
}
// get_path returns the collected name as a string, stopping at the
// first NUL byte like a C string.
fn (l LongPath) get_path() string {
	mut end := l.name.len
	for i, ch in l.name {
		if ch == 0 {
			end = i
			break
		}
	}
	return l.name[..end].bytestr()
}