z1brc

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README

commit 25f18c4261eac8834bb96809c5e3b9f17b5d536d
Author: Martin Ashby <martin@ashbysoft.com>
Date:   Tue, 30 Jan 2024 11:39:05 +0000

Initial

Diffstat:
A.gitignore | 2++
AREADME.md | 5+++++
Abuild.zig | 91+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Abuild.zig.zon | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/main.zig | 155+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/root.zig | 10++++++++++
6 files changed, 325 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+zig-out
+zig-cache/
diff --git a/README.md b/README.md
@@ -0,0 +1,5 @@
+# z1brc
+
+Zig version of [One Billion Row Challenge](https://github.com/gunnarmorling/1brc)
+
+This is my naïve solution.
diff --git a/build.zig b/build.zig
@@ -0,0 +1,91 @@
+const std = @import("std");
+
+// Although this function looks imperative, note that its job is to
+// declaratively construct a build graph that will be executed by an external
+// runner.
+pub fn build(b: *std.Build) void {
+    // Standard target options allows the person running `zig build` to choose
+    // what target to build for. Here we do not override the defaults, which
+    // means any target is allowed, and the default is native. Other options
+    // for restricting supported target set are available.
+    const target = b.standardTargetOptions(.{});
+
+    // Standard optimization options allow the person running `zig build` to select
+    // between Debug, ReleaseSafe, ReleaseFast, and ReleaseSmall. Here we do not
+    // set a preferred release mode, allowing the user to decide how to optimize.
+    const optimize = b.standardOptimizeOption(.{});
+
+    const lib = b.addStaticLibrary(.{
+        .name = "z1brc",
+        // In this case the main source file is merely a path, however, in more
+        // complicated build scripts, this could be a generated file.
+        .root_source_file = .{ .path = "src/root.zig" },
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // This declares intent for the library to be installed into the standard
+    // location when the user invokes the "install" step (the default step when
+    // running `zig build`).
+    b.installArtifact(lib);
+
+    const exe = b.addExecutable(.{
+        .name = "z1brc",
+        .root_source_file = .{ .path = "src/main.zig" },
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // This declares intent for the executable to be installed into the
+    // standard location when the user invokes the "install" step (the default
+    // step when running `zig build`).
+    b.installArtifact(exe);
+
+    // This *creates* a Run step in the build graph, to be executed when another
+    // step is evaluated that depends on it. The next line below will establish
+    // such a dependency.
+    const run_cmd = b.addRunArtifact(exe);
+
+    // By making the run step depend on the install step, it will be run from the
+    // installation directory rather than directly from within the cache directory.
+    // This is not necessary, however, if the application depends on other installed
+    // files, this ensures they will be present and in the expected location.
+    run_cmd.step.dependOn(b.getInstallStep());
+
+    // This allows the user to pass arguments to the application in the build
+    // command itself, like this: `zig build run -- arg1 arg2 etc`
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    // This creates a build step. It will be visible in the `zig build --help` menu,
+    // and can be selected like this: `zig build run`
+    // This will evaluate the `run` step rather than the default, which is "install".
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+
+    // Creates a step for unit testing. This only builds the test executable
+    // but does not run it.
+    const lib_unit_tests = b.addTest(.{
+        .root_source_file = .{ .path = "src/root.zig" },
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const run_lib_unit_tests = b.addRunArtifact(lib_unit_tests);
+
+    const exe_unit_tests = b.addTest(.{
+        .root_source_file = .{ .path = "src/main.zig" },
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_lib_unit_tests.step);
+    test_step.dependOn(&run_exe_unit_tests.step);
+}
diff --git a/build.zig.zon b/build.zig.zon
@@ -0,0 +1,62 @@
+.{
+    .name = "z1brc",
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.0.0",
+
+    // This field is optional.
+    // This is currently advisory only; Zig does not yet do anything
+    // with this value.
+    //.minimum_zig_version = "0.11.0",
+
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+        //},
+    },
+
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package.
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        // This makes *all* files, recursively, included in this package. It is generally
+        // better to explicitly list the files and directories instead, to ensure that
+        // fetching from tarballs, file system paths, and version control all result
+        // in the same contents hash.
+        "",
+        // For example...
+        //"build.zig",
+        //"build.zig.zon",
+        //"src",
+        //"LICENSE",
+        //"README.md",
+    },
+}
diff --git a/src/main.zig b/src/main.zig
@@ -0,0 +1,176 @@
+const std = @import("std");
+
+/// Readings are held as integer tenths of a degree, shifted up by this
+/// offset so that the range -99.9..99.9 maps onto 0..1998 and fits a u16.
+const offset = 999;
+
+/// Entry point: maps the file named by the first argument, aggregates it
+/// with `run` and writes the result to stdout.
+pub fn main() !void {
+    var t = try std.time.Timer.start();
+    std.log.err("start!", .{});
+
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    defer _ = gpa.deinit();
+    const a = gpa.allocator();
+
+    var args = std.process.args();
+    defer args.deinit();
+    if (!args.skip()) @panic("program name wasn't supplied wtf");
+    const infile_name = args.next() orelse return error.NoInputFile;
+    const infile = try open_mmap(std.fs.cwd(), infile_name);
+    defer std.os.munmap(infile);
+
+    const out = try run(a, infile, &t);
+    defer a.free(out);
+    try std.io.getStdOut().writeAll(out);
+
+    std.log.err("finished at {} s", .{t.read() / std.time.ns_per_s});
+}
+
+/// Aggregates `infile`, one `<station>;<reading>` per line, into
+/// `{station=min/max/mean, ...}` with stations in byte order.
+///
+/// Station names are borrowed from `infile`, which must stay valid for the
+/// whole call. The caller owns the returned slice and frees it with `a`.
+/// Fails with `error.Malformatted` on a line without `;` or on a reading
+/// outside -99.9..99.9. `t` is only read, to time the progress messages.
+fn run(a: std.mem.Allocator, infile: []const u8, t: *std.time.Timer) ![]const u8 {
+    // Progress is logged at info level: the test runner fails any test that
+    // logs at error level.
+    std.log.info("mmap done, iterating!", .{});
+    // Keys are slices of `infile`, so there is nothing to free per station.
+    var res = std.StringArrayHashMap(Accumulator).init(a);
+    defer res.deinit();
+
+    var lines = std.mem.tokenizeScalar(u8, infile, '\n');
+    var ct: usize = 0;
+    while (lines.next()) |line| {
+        ct += 1;
+        if (ct % 100000 == 0) {
+            std.log.info("processed {} lines at {} seconds", .{ ct, t.read() / std.time.ns_per_s });
+        }
+        var spl = std.mem.splitScalar(u8, line, ';');
+        const key = spl.first();
+        const val_s = spl.next() orelse return error.Malformatted;
+        const val = try parseReading(val_s);
+
+        const gpr = try res.getOrPut(key);
+        if (gpr.found_existing) {
+            const e = gpr.value_ptr;
+            e.min = @min(e.min, val);
+            e.max = @max(e.max, val);
+            e.sum += val;
+            e.count += 1;
+        } else {
+            gpr.value_ptr.* = .{ .min = val, .max = val, .sum = val, .count = 1 };
+        }
+    }
+
+    // Go through the keys sorted by character value.
+    const Srt = struct {
+        keys: []const []const u8,
+        pub fn lessThan(self: @This(), a_index: usize, b_index: usize) bool {
+            return std.mem.lessThan(u8, self.keys[a_index], self.keys[b_index]);
+        }
+    };
+    res.sort(Srt{ .keys = res.keys() });
+
+    var rr = std.ArrayList(u8).init(a);
+    defer rr.deinit();
+    const ww = rr.writer();
+    try ww.writeAll("{");
+    var it = res.iterator();
+    while (it.next()) |nxt| {
+        const v = nxt.value_ptr.*;
+        // Round the mean half up rather than truncating it. Doing this before
+        // the offset is removed keeps the arithmetic unsigned.
+        const mean = (v.sum + v.count / 2) / v.count;
+        try ww.writeAll(nxt.key_ptr.*);
+        try ww.writeAll("=");
+        try writeTenths(ww, @as(i32, v.min) - offset);
+        try ww.writeAll("/");
+        try writeTenths(ww, @as(i32, v.max) - offset);
+        try ww.writeAll("/");
+        try writeTenths(ww, @as(i32, @intCast(mean)) - offset);
+        try ww.writeAll(", ");
+    }
+    try ww.writeAll("}");
+    return try rr.toOwnedSlice();
+}
+
+/// Parses a reading such as "-12.3" into offset tenths (see `offset`).
+/// Every character other than '-' and the digits is skipped, so the text is
+/// assumed to carry exactly one fractional digit.
+fn parseReading(text: []const u8) !u16 {
+    var magnitude: u16 = 0;
+    var is_neg = false;
+    for (text) |c| {
+        if (c == '-') {
+            is_neg = true;
+        } else if (c >= '0' and c <= '9') {
+            magnitude = magnitude * 10 + (c - '0');
+            // Checked per digit so that overlong input cannot overflow.
+            if (magnitude > offset) return error.Malformatted;
+        }
+    }
+    // Negative readings sit below the offset, positive ones above it.
+    return if (is_neg) offset - magnitude else offset + magnitude;
+}
+
+/// Writes tenths of a degree with one fractional digit: -15 gives "-1.5".
+fn writeTenths(writer: anytype, tenths: i32) !void {
+    // Split the magnitude, not the signed value: floored division would
+    // render -15 as "-2.5".
+    const magnitude: u32 = @intCast(if (tenths < 0) -tenths else tenths);
+    const sign: []const u8 = if (tenths < 0) "-" else "";
+    try writer.print("{s}{d}.{d}", .{ sign, magnitude / 10, magnitude % 10 });
+}
+
+/// Running statistics for one station, in offset tenths of a degree.
+const Accumulator = struct {
+    min: u16,
+    max: u16,
+    sum: u64,
+    count: u64,
+};
+
+/// Maps `file_path`, resolved against `dir`, read-only into memory.
+/// Result must be closed with std.os.munmap
+fn open_mmap(dir: std.fs.Dir, file_path: []const u8) ![]align(std.mem.page_size) u8 {
+    const f = try dir.openFile(file_path, .{ .mode = .read_only });
+    defer f.close();
+    const stat = try f.stat();
+    return try std.os.mmap(null, stat.size, std.os.PROT.READ, std.os.MAP.PRIVATE, f.handle, 0);
+}
+
+const test_input =
+    \\Hamburg;12.0
+    \\Bulawayo;8.9
+    \\Palembang;38.8
+    \\St. John's;15.2
+    \\Cracow;12.6
+    \\Bridgetown;26.9
+    \\Istanbul;6.2
+    \\Roseau;34.4
+    \\Conakry;31.2
+    \\Istanbul;23.0
+;
+const test_output =
+    \\{Bridgetown=26.9/26.9/26.9, Bulawayo=8.9/8.9/8.9, Conakry=31.2/31.2/31.2, Cracow=12.6/12.6/12.6, Hamburg=12.0/12.0/12.0, Istanbul=6.2/23.0/14.6, Palembang=38.8/38.8/38.8, Roseau=34.4/34.4/34.4, St. John's=15.2/15.2/15.2, }
+;
+test {
+    const a = std.testing.allocator;
+    var t = try std.time.Timer.start();
+    const out = try run(a, test_input, &t);
+    defer a.free(out);
+    try std.testing.expectEqualStrings(test_output, out);
+}
+
+test "negative readings keep their sign" {
+    const a = std.testing.allocator;
+    var t = try std.time.Timer.start();
+    const out = try run(a, "Oslo;-1.5\nOslo;-0.5\nOslo;2.0\n", &t);
+    defer a.free(out);
+    try std.testing.expectEqualStrings("{Oslo=-1.5/2.0/0.0, }", out);
+}
diff --git a/src/root.zig b/src/root.zig
@@ -0,0 +1,10 @@
+const std = @import("std");
+const testing = std.testing;
+
+export fn add(a: i32, b: i32) i32 {
+    return a + b;
+}
+
+test "basic add functionality" {
+    try testing.expect(add(3, 7) == 10);
+}