vlib: add archive.tar module to enable reading of .tar and .tar.gz files (#24995)

This commit is contained in:
Jorge Mireles 2025-07-30 10:11:41 -06:00 committed by GitHub
parent b876644e82
commit a8d75c10b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 969 additions and 0 deletions

3
vlib/archive/README.md Normal file
View file

@ -0,0 +1,3 @@
## Description
`archive` is a namespace for different archive formats like `tar` or `zip`.

View file

@ -0,0 +1,33 @@
## Description
`tar` is a module to access tar archives.
Tape archives (tar) are a file format for storing a sequence of files that can be read and written
as streams. This module covers the reading of the basic sections of archives produced by GNU tools
like the Linux command `tar -xvf`, but in memory instead of modifying the filesystem. It parses
directories, files, and file contents, and manages paths longer than 100 chars.
### Read Efficiency
An entire tar file can be read in memory or by chunks. It keeps in memory a single decompressed
[chunk](https://modules.vlang.io/compress.gzip.html#decompress_with_callback) of 32 KB at a time
and also a single tar block of 512 bytes at a time. Paths are converted to strings only when
needed, and the user's reader implementation can stop the reading process early.
### Read Example
The tar blocks are parsed and some fields are passed to `Reader` implemented methods.
```v
import os
import archive.tar
fn main() {
os.chdir(@VMODROOT) or {}
path := 'archive/tar/testdata/life.tar.gz'
reader := tar.new_debug_reader()
tar.read_tar_gz_file(path, reader)!
}
```
Look also in `examples` folder the `tar_gz_reader.v` program.

277
vlib/archive/tar/reader.v Normal file
View file

@ -0,0 +1,277 @@
module tar
import compress.gzip
import os
// read_tar_gz_file decompresses a given local .tar.gz file at `path` and
// feeds every 512-byte tar block to the given `reader` implementation.
pub fn read_tar_gz_file(path string, reader Reader) ! {
	compressed := os.read_bytes(path)!
	decompressed := gzip.decompress(compressed)!
	mut untar := Untar{
		reader: reader
	}
	untar.read_all_blocks(decompressed)!
}
// Read is used by Untar to call Reader implemented methods.
// The implementor can read the block's `get_block_number()` and `get_path()`
// and can set the field `stop_early` to true to suspend the reading.
pub struct Read {
mut:
	block_number int // consecutive 1-based counter of parsed tar blocks
	special      BlockSpecial
	prefix_len   int // used length of prefix_buf
	prefix_buf   [131]u8 // raw bytes of the header `prefix` field (offset 345)
	separator    bool // whether a `/` is inserted between prefix and path
	path_len     int // used length of path_buf
	path_buf     [100]u8 // raw bytes of the header `name` field (offset 0)
	long_path    &LongPath = unsafe { nil } // non-nil when a GNU `L` long name applies
pub mut:
	stop_early bool // set by the Reader implementor to suspend reading
}
// set_short_path sets Read path with the tar block strings `prefix` and `path`.
// Block's `prefix` C string max length is 131 but most of the time it is 0.
// Block's `path` C string max length is 100. Both `prefix` and `path` are
// kept as raw bytes and converted to a V string only when needed, see `get_path()`.
fn (mut b Read) set_short_path(buffer [512]u8, separator_after_prefix bool) {
	// first check if the TAR block has a prefix string (0 to 131 chars). The
	// prefix is other than '' when the TAR block filepath len is > 100.
	b.prefix_len = 0
	for i := 345; i < 345 + 131; i++ {
		letter := buffer[i]
		if letter == 0 {
			break // first 0 found means prefix C string is complete.
		}
		b.prefix_buf[b.prefix_len] = letter
		b.prefix_len++
	}
	b.separator = separator_after_prefix
	// most of the time there is a path for blocks like dirs and regular files:
	b.path_len = 0
	for i := 0; i < 100; i++ {
		letter := buffer[i]
		if letter == 0 {
			break // first 0 found means path C string is complete.
		}
		b.path_buf[b.path_len] = letter
		b.path_len++
	}
}
// set_long_path points this Read at a previously collected GNU `L` long path,
// which then takes precedence over the short prefix/path in `get_path()`.
fn (mut b Read) set_long_path(long_path &LongPath) {
	b.long_path = unsafe { long_path }
}
// get_path returns the path of this read. The path is valid for blocks of types
// directory, file and file data.
pub fn (b Read) get_path() string {
	// a registered long path overrides the short prefix/path pair
	if b.long_path != unsafe { nil } {
		return b.long_path.get_path()
	}
	mut joined := []u8{cap: b.prefix_len + 1 + b.path_len}
	if b.prefix_len > 0 {
		joined << b.prefix_buf[0..b.prefix_len]
		if b.separator {
			joined << `/`
		}
	}
	if b.path_len > 0 {
		joined << b.path_buf[0..b.path_len]
	}
	return joined.bytestr()
}
// get_block_number returns the consecutive (1-based) number of this read.
pub fn (b Read) get_block_number() int {
	return b.block_number
}

// get_special returns the special type of the Read.
pub fn (b Read) get_special() BlockSpecial {
	return b.special
}

// str returns a string representation with block number, path, special type and stop early.
pub fn (r Read) str() string {
	return '(block_number:${r.block_number} path:${r.get_path()} special:${r.special} stop_early:${r.stop_early})'
}
// Reader is used by Untar to report the parsed blocks.
pub interface Reader {
mut:
	// dir_block is called when untar reads a block of type directory.
	// Call `Read.get_path()` to get the full name of the directory.
	// `size` field is zero for directories.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	dir_block(mut read Read, size u64)
	// file_block is called when untar reads a block of type filename.
	// Call `Read.get_path()` to get the full name of the file.
	// `size` is the expected file size in bytes to be read later.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	file_block(mut read Read, size u64)
	// data_block is called when untar reads a block of type filedata.
	// Call `Read.get_path()` to get the full name of the file data belongs to.
	// The `data` size is 512 bytes or less. `pending` indicates how many bytes are left to read.
	// The implementor can inspect the data and use the pending value
	// to set Read's field `stop_early` to suspend the reader.
	data_block(mut read Read, data []u8, pending int)
	// other_block is called when untar reads a block type other than directory,
	// filename or filedata, e.g. a `block device` or `FIFO`.
	// `Read.get_special()` and `details` give more info about the block.
	// The implementor can set Read's field `stop_early` to suspend the reader.
	other_block(mut read Read, details string)
}
// DebugReader implements a Reader and prints rows for blocks read
// as directories, files, file data blocks and special blocks.
pub struct DebugReader implements Reader {
}

// new_debug_reader returns a DebugReader
pub fn new_debug_reader() &DebugReader {
	return &DebugReader{}
}

// dir_block prints one row per directory block.
fn (mut t DebugReader) dir_block(mut read Read, size u64) {
	println('DIR #${read.get_block_number()} ${read.get_path()}')
}

// file_block prints one row per file header block.
fn (mut t DebugReader) file_block(mut read Read, size u64) {
	println('FILE #${read.get_block_number()} path:${read.get_path()} size:${size}')
}

// data_block prints one row per file content block.
fn (mut t DebugReader) data_block(mut read Read, data []u8, pending int) {
	println('DATA #${read.get_block_number()} ${read.get_path()} size:${data.len} pending:${pending}')
}

// other_block prints one row per special block.
fn (mut t DebugReader) other_block(mut read Read, details string) {
	println('OTHER #${read.get_block_number()} special:${read.special} ${details}')
}
// ReadResult is returned by ReadResultFn
pub enum ReadResult {
	@continue   // keep reading the next block
	stop_early  // the Reader implementor asked to suspend
	end_of_file // all supplied blocks were consumed
	end_archive // two consecutive blank blocks mark the archive end
	overflow    // internal buffer overflow (should not occur)
}

// ReadResultFn dispatches a single 512-byte block and reports how to proceed.
type ReadResultFn = fn (block []u8) !ReadResult
@[heap]
pub struct Decompressor {
mut:
	untar &Untar // receives the decompressed 512-byte blocks
}

// new_decompresor returns a Decompressor to decompress a tar.gz file
// A given Untar with a registered Reader will read the blocks.
// NOTE(review): the public name misses an `s` ("decompresor"); kept as-is
// because renaming would break callers.
pub fn new_decompresor(untar &Untar) &Decompressor {
	return &Decompressor{
		untar: untar
	}
}
// read_all decompresses the given `tar_gz` array with all the tar blocks.
// Then calls the untar method `read_all_blocks` to read every block at once.
// A read result is returned, which can be of the type stop early, or an error.
pub fn (mut d Decompressor) read_all(tar_gz []u8) !ReadResult {
	blocks := gzip.decompress(tar_gz)!
	result := d.untar.read_all_blocks(blocks)!
	return result
}
// read_chunks decompresses the given `tar_gz` array by chunks of
// 32768 bytes which can hold up to 64 tar blocks of 512 bytes each.
// Then calls untar method read_block with ChunksReader dispatcher.
// A read result is returned which can be of the type stop early or an error.
pub fn (mut d Decompressor) read_chunks(tar_gz []u8) !ReadResult {
	mut reader := &ChunksReader{
		read_block_fn: d.untar.read_single_block
	}
	// the gzip callback returns the consumed length to keep decompressing,
	// or 0 to make the decompression stop.
	callback := fn (chunk []u8, mut reader ChunksReader) int {
		result := reader.read_blocks(chunk)
		if result == .continue {
			return chunk.len // go for more
		}
		return 0 // suspend
	}
	gzip.decompress_with_callback(tar_gz, callback, reader) or {
		// a deliberately suspended decompression surfaces here as a gzip
		// error; only propagate it when this reader did not ask to stop
		if reader.result == .continue {
			return err
		}
		return reader.result
	}
	return reader.result
}
// ChunksReader has a reusable fixed buffer with maximum length of decompressed chunk
// of 32768 bytes plus a maximum previous pending tar block of 512 bytes.
struct ChunksReader {
mut:
	read_block_fn  ReadResultFn = unsafe { nil } // dispatches one 512-byte block
	buffer         [32768 + 512]u8 // leftover bytes + newest chunk
	chunks_counter int // number of chunks received so far
	pending        int // position of the last not sent buffer byte
	result         ReadResult // last result from read_block_fn
}
// read_blocks receives a chunk like those of 32k from a gzip decompressor. The chunk is
// assumed to be a TAR archive section and is cut in 512 bytes blocks that are sent to
// the untar reader one by one. The untar reader result informs this process to continue or
// stop early. This process can keep in the buffer the remaining bytes of an incomplete
// block, which will be sent to the untar reader prepended to the next chunk's cuts.
fn (mut d ChunksReader) read_blocks(chunk []u8) ReadResult {
	d.chunks_counter++
	total := d.pending + chunk.len
	if total > d.buffer.len {
		assert false, 'Should not occur buffer overflow ${total}'
		return .overflow
	}
	// append new chunk after previous incomplete block bytes not sent yet
	for i, ch in chunk {
		d.buffer[i + d.pending] = ch
	}
	d.pending += chunk.len
	mut cut := 0
	for {
		if cut + 512 > d.pending {
			// after sending all complete blocks move the remaining not sent bytes
			// to the start of the reused buffer to be prepended before next chunk.
			// fix: target index is `i - cut` — the previous `cut - 512` wrote a
			// constant index, out of bounds whenever cut < 512.
			for i := cut; i < d.pending; i++ {
				d.buffer[i - cut] = d.buffer[i]
			}
			d.pending -= cut
			return .continue
		}
		// send a complete block
		block := d.buffer[cut..cut + 512]
		cut += 512
		d.result = d.read_block_fn(block) or {
			// a failure from the untar dispatcher, not a buffer overflow
			assert false, 'read_block_fn failed: ${err}'
			return .overflow
		}
		match d.result {
			.continue {
				// try next cut or leave a remaining
			}
			else {
				break // untar error or stop_early
			}
		}
	}
	return d.result
}

View file

@ -0,0 +1,157 @@
module tar
import os
// testsuite_begin runs before the tests: cd to the V module root so the
// relative `testdata` paths below resolve.
fn testsuite_begin() {
	os.chdir(@VMODROOT) or {}
}
const testdata = 'vlib/archive/tar/testdata'
// test_golang_testdata parses fixture archives copied from golang:
// https://github.com/golang/go/blob/master/src/archive/tar/testdata/file-and-dir.tar
fn test_golang_testdata() {
	// [ ] dir | 0 bytes | folder
	// [ ] small.txt | 5 bytes | file
	r1 := new_test_reader('file-and-dir.tar', false)!
	assert r1.dirs[0] == 'dir/'
	assert r1.files['small.txt'] == 5
	assert r1.data['small.txt'] == 'Kilts'.bytes()
	assert r1.other[0] == 'block:4 special:blank_1 continue'
	assert r1.other[1] == 'block:5 special:blank_2 end_archive'
	// [ ] small.txt | 5 bytes | file
	// [ ] small2.txt | 11 bytes | file
	r2 := new_test_reader('gnu.tar', false)!
	assert r2.dirs.len == 0
	assert r2.files['small.txt'] == 5
	assert r2.files['small2.txt'] == 11
	assert r2.data['small.txt'] == 'Kilts'.bytes()
	assert r2.data['small2.txt'] == 'Google.com\n'.bytes()
	// a filename with non-UTF-8 bytes:
	// [ ] h1<?><?><?><?>bye | 0 bytes
	r3 := new_test_reader('gnu-not-utf8.tar', false)!
	r3_filename := [u8(`h`), `i`, 0x80, 0x81, 0x82, 0x83, `b`, `y`, `e`].bytestr()
	r3_file_len := r3.files[r3_filename] or { assert false, 'file not found: ${r3_filename}' }
	assert r3_file_len == 0
	assert r3.other.len == 2
	// a GNU long name terminated by NUL:
	// [ ] 0123456789 | 0 bytes
	r4 := new_test_reader('gnu-long-nul.tar', false)!
	assert r4.dirs.len == 0
	r4_filename := '0123456789'
	r4_file_len := r4.files[r4_filename] or {
		assert false, 'file ${r4_filename} not found in ${r4.files.keys()}'
	}
	assert r4_file_len == 0
	assert r4.other[0] == 'block:1 special:long_name size:161'
	assert r4.other[1] == 'block:2 special:long_name data_part:161'
	// a long UTF-8 filename:
	// [ ] ☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹☺☻☹ | 0 bytes
	r5 := new_test_reader('gnu-utf8.tar', false)!
	r5_filename := ''
	r5_file_len := r5.files[r5_filename] or { assert false, 'file not found: ${r5_filename}' }
	assert r5_file_len == 0
	assert r5.other[0] == 'block:1 special:long_name size:163'
	assert r5.other[1] == 'block:2 special:long_name data_part:163'
}
// test_long_long_short checks that a long path (human) can be substituted by
// another long path (chimp) and then by a normal path (cat).
fn test_long_long_short() {
	r1 := new_test_reader_gz('life.tar.gz', false)!
	mammal := 'life/Animalia/Chordata/Mammalia'
	human := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Homo/Homo sapiens.txt'
	chimp := '${mammal}/Primates_Haplorhini_Simiiformes/Hominidae_Homininae_Hominini/Pan/Pan troglodytes.txt'
	cat := '${mammal}/Carnivora_Feliformia/Felidae_Felinae/Felis/Felis catus.txt'
	// the first two exercise the GNU long-name path, the third the short path
	assert human.len > 100
	assert chimp.len > 100
	assert cat.len <= 100
	assert r1.files[human] == 35
	assert r1.files[chimp] == 40
	assert r1.files[cat] == 33
	assert r1.texts[human] == 'https://en.wikipedia.org/wiki/Human'
	assert r1.texts[chimp] == 'https://en.wikipedia.org/wiki/Chimpanzee'
	assert r1.texts[cat] == 'https://en.wikipedia.org/wiki/Cat'
}
// TestReader implements Reader and records everything it is told,
// so the tests can assert on the collected state afterwards.
struct TestReader {
	debug bool // print rows while reading
mut:
	dirs      []string // paths of directory blocks, in order
	files     map[string]u64 // file path -> announced size
	data      map[string][]u8 // file path -> collected content bytes
	texts     map[string]string // file path -> content as string
	other     []string // formatted rows for special blocks
	last_file string // file currently accumulating data
	last_data []u8 // accumulated data of last_file
}
// new_test_reader reads files *.tar from testdata and returns the
// TestReader holding the collected blocks.
fn new_test_reader(tar_file string, debug bool) !&TestReader {
	mut reader := &TestReader{
		debug: debug
	}
	mut untar := Untar{
		reader: reader
	}
	all_blocks := os.read_bytes('${testdata}/${tar_file}')!
	untar.read_all_blocks(all_blocks)!
	return reader
}
// new_test_reader_gz reads files *.tar.gz from testdata through a
// Decompressor and returns the TestReader holding the collected blocks.
fn new_test_reader_gz(tar_gz_file string, debug bool) !&TestReader {
	mut reader := &TestReader{
		debug: debug
	}
	mut untar := Untar{
		reader: reader
	}
	mut decompressor := new_decompresor(untar)
	tar_gz := os.read_bytes('${testdata}/${tar_gz_file}')!
	decompressor.read_all(tar_gz)!
	return reader
}
// dir_block records the directory path.
fn (mut t TestReader) dir_block(mut read Read, size u64) {
	t.dirs << read.get_path()
	if t.debug {
		println('DIR #${read.get_block_number()} ${read.get_path()}')
	}
}

// file_block records the announced file size and remembers the path so
// subsequent data blocks are attributed to it.
fn (mut t TestReader) file_block(mut read Read, size u64) {
	t.last_file = read.get_path()
	t.files[t.last_file] = size
	if t.debug {
		println('FILE #${read.get_block_number()} ${read.get_path()}')
	}
}

// data_block accumulates content for the current file; when no bytes are
// pending, the complete content is stored under the file's path.
fn (mut t TestReader) data_block(mut read Read, data []u8, pending int) {
	path := read.get_path()
	if t.debug {
		println('DATA #${read.get_block_number()} ${path}')
	}
	if t.last_file == path {
		t.last_data << data
		if pending == 0 {
			t.data[t.last_file] = t.last_data.clone()
			t.texts[path] = t.last_data.bytestr()
			if t.debug {
				println('TEXT #${read.get_block_number()} ${t.last_data.bytestr()}')
			}
			t.last_file = ''
			t.last_data.clear()
		}
	}
}

// other_block records a formatted row for every special block.
fn (mut t TestReader) other_block(mut read Read, details string) {
	t.other << 'block:${read.block_number} special:${read.special} ${details}'
	if t.debug {
		println('OTHER #${read.get_block_number()} special:${read.special} ${details}')
	}
}

43
vlib/archive/tar/tar.v Normal file
View file

@ -0,0 +1,43 @@
module tar
// ustar header block octets
// Field | Offset | Length
// --------------------------
// name | 0 | 100
// mode | 100 | 8
// uid | 108 | 8
// gid | 116 | 8
// size | 124 | 12
// mtime | 136 | 12
// chksum | 148 | 8
// typeflag | 156 | 1
// linkname | 157 | 100
// magic | 257 | 6
// version | 263 | 2
// uname | 265 | 32
// gname | 297 | 32
// devmajor | 329 | 8
// devminor | 337 | 8
// prefix | 345 | 155
// BlockHeader is the `typeflag` byte at offset 156 of a ustar header block.
pub enum BlockHeader as u8 {
	file      = u8(`0`) // 0x30
	hard_link = u8(`1`) // 0x31
	sym_link  = u8(`2`) // 0x32
	char_dev  = u8(`3`) // 0x33
	block_dev = u8(`4`) // 0x34
	dir       = u8(`5`) // 0x35
	fifo      = u8(`6`) // 0x36
	long_name = u8(`L`) // 0x4c = 76 dec, GNU long file name
	global    = u8(`g`) // 0x67 pax
}
// BlockSpecial classifies a parsed block for the Reader.
pub enum BlockSpecial {
	no // for headers `0`,`5` or data blocks
	blank_1 // first blank block: continue
	blank_2 // second blank block: end of archive
	ignore // for headers `1`, `2`, `3`, `4`, `6`
	long_name // for header `L`
	global // for header `g`
	unknown // for a header value not defined above
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
vlib/archive/tar/testdata/gnu-utf8.tar vendored Normal file

Binary file not shown.

BIN
vlib/archive/tar/testdata/gnu.tar vendored Normal file

Binary file not shown.

BIN
vlib/archive/tar/testdata/life.tar.gz vendored Normal file

Binary file not shown.

290
vlib/archive/tar/untar.v Normal file
View file

@ -0,0 +1,290 @@
module tar
// Untar uses a reader to parse the contents of a unix tar file.
// Reuses a fixed array of 512 bytes to parse each TAR block.
@[heap]
pub struct Untar {
mut:
	reader      Reader
	max_blocks  int // total block count; known only via read_all_blocks
	buffer      [512]u8 // data to parse block
	read        Read // last read to send/receive to/from reader implementation
	state       State // what the next block is expected to contain
	size        int // remaining data size during state_data
	long_path   &LongPath = unsafe { nil } // not nil to hold a file long_name
	blank_block int = -1 // last no-data block with all-zeros
}
// State tracks what the parser expects in the next 512-byte block.
enum State {
	header // a ustar header block
	data // file content blocks
	long_path // GNU `L` long-name content blocks
}
// new_untar builds an Untar with a given Reader.
pub fn new_untar(reader Reader) &Untar {
	return &Untar{
		reader: reader
	}
}
// str returns a string representation with max_blocks and the last read.
pub fn (u Untar) str() string {
	return 'max_blocks:${u.max_blocks} last_read:${u.read}'
}
// read_all_blocks parses the data blocks of any decompressed *.tar.gz array.
// The data blocks length must be divisible by 512.
pub fn (mut u Untar) read_all_blocks(blocks []u8) !ReadResult {
	if blocks.len % 512 != 0 {
		return error('data_blocks size is not a multiple of 512')
	}
	u.max_blocks = blocks.len / 512
	mut offset := 0
	for offset < blocks.len {
		result := u.read_single_block(blocks[offset..offset + 512])!
		if result != .continue {
			return result
		}
		offset += 512
	}
	return .end_of_file
}
// read_single_block parses one data block at a time.
// The data block length must be 512. Two consecutive all-zero (blank)
// blocks mark the end of the archive and return a .end_archive result.
pub fn (mut u Untar) read_single_block(block []u8) !ReadResult {
	if block.len != 512 {
		return error('data_block size is not 512')
	}
	u.read.block_number++ // 1,2,3...
	// copy into the reusable buffer and detect an all-zero block on the way
	mut is_blank_block := true
	for i in 0 .. 512 {
		u.buffer[i] = block[i]
		if block[i] != 0 {
			is_blank_block = false
		}
	}
	match u.state {
		.header {
			if is_blank_block {
				// current non-data block is a blank block
				prev_block := u.read.block_number - 1
				result := if u.blank_block == prev_block {
					// two consecutive blank blocks
					u.read.special = .blank_2
					ReadResult.end_archive
				} else {
					// first blank block
					u.read.special = .blank_1
					ReadResult.continue
				}
				u.read.path_len = 0
				u.reader.other_block(mut u.read, '${result}')
				u.blank_block = u.read.block_number
				return result
			}
			u.read_header()!
		}
		.data {
			u.read_data()
		}
		.long_path {
			u.read_long_path()
		}
	}
	return if u.read.stop_early {
		.stop_early
	} else {
		.continue
	}
}
// read_header parses a 512-byte ustar header block, dispatches it to the
// Reader by type, and sets the next parser state (data/long_path) when the
// block announces content.
fn (mut u Untar) read_header() ! {
	u.size = int(u.extract_octal(124, 12)) // `size` field: offset 124, 12 octal digits
	header := u.buffer[156] // pos 0x9c, the `typeflag` byte
	block_header := BlockHeader.from(header) or {
		// unknown typeflag: report it and keep scanning headers
		u.read.special = .unknown
		u.read.path_len = 0
		u.reader.other_block(mut u.read, 'size:${u.size}')
		return
	}
	match block_header {
		.dir {
			if !u.checksum_ok() {
				return error('Checksum error: directory reading:${u.read}')
			}
			u.read.special = .no
			u.read.set_short_path(u.buffer, false)
			u.reader.dir_block(mut u.read, u64(u.size))
			// u.state = .header
		}
		.file {
			if !u.checksum_ok() {
				return error('Checksum error file reading:${u.read}')
			}
			u.read.special = .no
			if u.long_path != unsafe { nil } {
				// a preceding `L` block supplied this file's long name
				u.read.set_long_path(u.long_path)
				if u.size > 0 {
					u.state = .data
				}
			} else {
				u.read.set_short_path(u.buffer, true)
				if u.size > 0 {
					u.state = .data
				}
			}
			u.reader.file_block(mut u.read, u64(u.size))
		}
		.long_name {
			// GNU `L` block: the following data blocks carry the long file name
			u.read.special = .long_name
			u.reader.other_block(mut u.read, 'size:${u.size}')
			if u.size > 0 {
				u.state = .long_path
				u.long_path = new_long_path(u.size)
			}
		}
		.hard_link, .sym_link, .char_dev, .block_dev, .fifo {
			// links and device nodes are reported but not materialized
			u.read.special = .ignore
			u.reader.other_block(mut u.read, block_header.str())
		}
		.global {
			// pax global header: its content is consumed as plain data
			u.read.special = .global
			u.read.set_short_path(u.buffer, false)
			u.reader.other_block(mut u.read, 'size:${u.size}')
			if u.size > 0 {
				u.state = .data
			}
		}
	}
}
// read_data calls Reader.data_block for the implementor to collect data parts as file content.
fn (mut u Untar) read_data() {
	if u.size > 0 {
		part := if u.size > 512 { 512 } else { u.size }
		u.size -= 512
		pending := if u.size > 0 { u.size } else { 0 }
		data_part := u.buffer[0..part]
		u.reader.data_block(mut u.read, data_part, pending)
	}
	if u.size <= 0 {
		// content finished: drop any long name and go back to header parsing
		u.long_path = unsafe { nil }
		u.read.long_path = unsafe { nil } // real clear
		u.state = .header
	}
}
// read_long_path collects GNU `L` content blocks into the pending LongPath
// until `size` bytes were consumed, then returns to header parsing.
fn (mut u Untar) read_long_path() {
	if u.size > 0 {
		part := if u.size > 512 { 512 } else { u.size }
		u.size -= 512
		data_part := u.buffer[0..part]
		if u.long_path != unsafe { nil } {
			// this long path field collects the data parts as file long name
			u.long_path.append(data_part)
			u.reader.other_block(mut u.read, 'data_part:${data_part.len}')
		}
	}
	if u.size <= 0 {
		u.state = .header
	}
}
// extract_path returns the block path for directories and files,
// reading the NUL-terminated `name` C string from the buffer start.
fn (mut u Untar) extract_path() string {
	mut bytes := []u8{}
	for letter in u.buffer {
		if letter == 0 {
			break
		}
		bytes << letter
	}
	return bytes.bytestr()
}
// checksum_ok verifies the validity of dir and file blocks. The eight
// checksum bytes themselves (offsets 148..155) are counted as ASCII
// spaces (0x20), then the sum is compared with the stored octal value.
fn (mut u Untar) checksum_ok() bool {
	mut total := u64(0)
	for n := 0; n < 512; n++ {
		if n >= 148 && n <= 155 {
			total += 0x20
		} else {
			total += u.buffer[n]
		}
	}
	return total == u.extract_octal(148, 8)
}
// extract_octal reads an octal number at block position `pos` with a given number of `digits`.
fn (mut u Untar) extract_octal(pos int, digits int) u64 {
	mut value := u64(0)
	mut p := pos
	mut left := digits
	// skip leading non-octal bytes (padding, spaces, NULs)
	for left > 0 && (u.buffer[p] < `0` || u.buffer[p] > `7`) {
		p++
		left--
	}
	// accumulate consecutive octal digits
	for left > 0 && u.buffer[p] >= `0` && u.buffer[p] <= `7` {
		value = value * 8 + u64(u.buffer[p] - `0`)
		p++
		left--
	}
	return value
}
@[heap]
struct LongPath {
mut:
	name     []u8 // fixed-size buffer for the long name bytes
	last_pos int // next write position in `name`
}

// new_long_path builds a LongPath with a fixed maximum name size
fn new_long_path(size int) &LongPath {
	return &LongPath{
		name: []u8{len: size}
	}
}
// append copies `data` into the fixed-size name buffer at the current write
// position; data that would overflow the preallocated size is ignored.
fn (mut l LongPath) append(data []u8) {
	if l.name.len >= l.last_pos + data.len {
		for i, d in data {
			l.name[l.last_pos + i] = d
		}
		l.last_pos += data.len
	}
}
// get_path returns the collected name as a string, stopping at the
// first NUL byte like a C string.
fn (l LongPath) get_path() string {
	mut end := l.name.len
	for i, ch in l.name {
		if ch == 0 {
			end = i
			break
		}
	}
	return l.name[..end].bytestr()
}