commit d28b913247517105fbd53c940c262d85b7c84314 Author: Nugraha Date: Tue Jul 5 04:52:54 2022 +0700 all: initial files Signed-off-by: Nugraha diff --git a/README.md b/README.md new file mode 100644 index 0000000..07d55af --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# gouring \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..546250a --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module github.com/ii64/gouring + +go 1.18 + +require github.com/stretchr/testify v1.7.0 + +require ( + github.com/davecgh/go-spew v1.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..acb88a4 --- /dev/null +++ b/go.sum @@ -0,0 +1,11 @@ +github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/hdr.go b/hdr.go new file mode 100644 index 0000000..891cfb8 --- /dev/null +++ b/hdr.go @@ -0,0 +1,629 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring interface. 
+ * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +package gouring + +/* + * IO submission data structure (Submission Queue Entry) + */ +type IoUringSqe_Union1 union64 + +func (u *IoUringSqe_Union1) SetOffset(v uint64) { (*union64)(u).PutUint64(v) } +func (u *IoUringSqe_Union1) SetAddr2(v uint64) { (*union64)(u).PutUint64(v) } + +type IoUringSqe_Union2 union64 + +func (u *IoUringSqe_Union2) SetAddr(v uint64) { (*union64)(u).PutUint64(v) } +func (u *IoUringSqe_Union2) SetSpliceOffsetIn(v uint64) { (*union64)(u).PutUint64(v) } + +type IoUringSqe_Union3 union32 + +func (u *IoUringSqe_Union3) SetRwFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetPollEvents(v uint16) { (*union32)(u).PutUint16(v) } +func (u *IoUringSqe_Union3) SetPoll32Events(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetSyncRangeFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetMsgFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetTimeoutFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetAcceptFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetCancelFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetOpenFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetStatxFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetFadviseAdvice(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetSpliceFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetRenameFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetUnlinkFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetHardlinkFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetXattrFlags(v uint32) { (*union32)(u).PutUint32(v) } +func (u *IoUringSqe_Union3) SetOpFlags(v uint32) { (*union32)(u).PutUint32(v) } //generic +func (u *IoUringSqe_Union3) GetOpFlags() uint32 { return (*union32)(u).Uint32() } //generic + +type IoUringSqe_Union4 union16 + +func (u *IoUringSqe_Union4) SetBufIndex(v uint16) { (*union16)(u).PutUint16(v) } +func (u *IoUringSqe_Union4) SetBufGroup(v uint16) { (*union16)(u).PutUint16(v) } + +type IoUringSqe_Union5 union32 + +func (u *IoUringSqe_Union5) SetSpliceFdIn(v int32) { (*union32)(u).PutInt32(v) } +func (u *IoUringSqe_Union5) SetFileIndex(v uint32) { (*union32)(u).PutUint32(v) } + +type IoUringSqe struct { + Opcode IoUringOp /* type of operation for this sqe */ + Flags uint8 /* IOSQE_ flags */ + IoPrio uint16 /* ioprio for the request */ + Fd int32 /* file descriptor to do IO on */ + + // union { + // __u64 off; /* offset into file */ + // __u64 addr2; + // }; + IoUringSqe_Union1 + + // union { + // __u64 addr; /* pointer to buffer or iovecs */ + // __u64 splice_off_in; + // }; + IoUringSqe_Union2 + + Len uint32 /* buffer size or number of iovecs */ + + // union { + // __kernel_rwf_t rw_flags; + // __u32 fsync_flags; + // __u16 poll_events; /* compatibility */ + // __u32 poll32_events; /* word-reversed for BE */ + // __u32 sync_range_flags; + // __u32 msg_flags; + // __u32 timeout_flags; + // __u32 accept_flags; + // __u32 cancel_flags; + // __u32 open_flags; + // __u32 statx_flags; + // __u32 fadvise_advice; + // __u32 splice_flags; + // __u32 rename_flags; + // __u32 unlink_flags; + // __u32 hardlink_flags; + // __u32 xattr_flags; + // }; + IoUringSqe_Union3 + + 
UserData uint64 /* data to be passed back at completion time */ + + /* pack this to avoid bogus arm OABI complaints */ + // union { + // /* index into fixed buffers, if used */ + // __u16 buf_index; + // /* for grouped buffer selection */ + // __u16 buf_group; + // } __attribute__((packed)); + IoUringSqe_Union4 + + /* personality to use, if used */ + Personality uint16 + + // union { + // __s32 splice_fd_in; + // __u32 file_index; + // }; + IoUringSqe_Union5 + + Addr3 uint64 + __pad2 [1]uint64 +} + +/* + * If sqe->file_index is set to this for opcodes that instantiate a new + * direct descriptor (like openat/openat2/accept), then io_uring will allocate + * an available direct descriptor instead of having the application pass one + * in. The picked direct descriptor will be returned in cqe->res, or -ENFILE + * if the space is full. + */ +const IORING_FILE_INDEX_ALLOC = ^uint32(0) + +const ( + IOSQE_FIXED_FILE_BIT = iota + IOSQE_IO_DRAIN_BIT + IOSQE_IO_LINK_BIT + IOSQE_IO_HARDLINK_BIT + IOSQE_ASYNC_BIT + IOSQE_BUFFER_SELECT_BIT + IOSQE_CQE_SKIP_SUCCESS_BIT +) + +/* + * sqe->flags + */ +const ( + /* use fixed fileset */ + IOSQE_FIXED_FILE = (1 << IOSQE_FIXED_FILE_BIT) + /* issue after inflight IO */ + IOSQE_IO_DRAIN = (1 << IOSQE_IO_DRAIN_BIT) + /* links next sqe */ + IOSQE_IO_LINK = (1 << IOSQE_IO_LINK_BIT) + /* like LINK, but stronger */ + IOSQE_IO_HARDLINK = (1 << IOSQE_IO_HARDLINK_BIT) + /* always go async */ + IOSQE_ASYNC = (1 << IOSQE_ASYNC_BIT) + /* select buffer from sqe->buf_group */ + IOSQE_BUFFER_SELECT = (1 << IOSQE_BUFFER_SELECT_BIT) + /* don't post CQE if request succeeded */ + IOSQE_CQE_SKIP_SUCCESS = (1 << IOSQE_CQE_SKIP_SUCCESS_BIT) +) + +/* + * io_uring_setup() flags + */ +const ( + IORING_SETUP_IOPOLL = (1 << 0) /* io_context is polled */ + IORING_SETUP_SQPOLL = (1 << 1) /* SQ poll thread */ + IORING_SETUP_SQ_AFF = (1 << 2) /* sq_thread_cpu is valid */ + IORING_SETUP_CQSIZE = (1 << 3) /* app defines CQ size */ + IORING_SETUP_CLAMP = (1 << 4) /* clamp SQ/CQ ring sizes */ + IORING_SETUP_ATTACH_WQ = (1 << 5) /* attach to existing wq */ + IORING_SETUP_R_DISABLED = (1 << 6) /* start with ring disabled */ + IORING_SETUP_SUBMIT_ALL = (1 << 7) /* continue submit on error */ +) + +/* + * Cooperative task running. When requests complete, they often require + * forcing the submitter to transition to the kernel to complete. If this + * flag is set, work will be done when the task transitions anyway, rather + * than force an inter-processor interrupt reschedule. This avoids interrupting + * a task running in userspace, and saves an IPI. + */ +const IORING_SETUP_COOP_TASKRUN = (1 << 8) + +/* + * If COOP_TASKRUN is set, get notified if task work is available for + * running and a kernel transition would be needed to run it. This sets + * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. 
+ */ +const IORING_SETUP_TASKRUN_FLAG = (1 << 9) + +const IORING_SETUP_SQE128 = (1 << 10) /* SQEs are 128 byte */ +const IORING_SETUP_CQE32 = (1 << 11) /* CQEs are 32 byte */ + +type IoUringOp = uint8 + +const ( + IORING_OP_NOP IoUringOp = iota + IORING_OP_READV + IORING_OP_WRITEV + IORING_OP_FSYNC + IORING_OP_READ_FIXED + IORING_OP_WRITE_FIXED + IORING_OP_POLL_ADD + IORING_OP_POLL_REMOVE + IORING_OP_SYNC_FILE_RANGE + IORING_OP_SENDMSG + IORING_OP_RECVMSG + IORING_OP_TIMEOUT + IORING_OP_TIMEOUT_REMOVE + IORING_OP_ACCEPT + IORING_OP_ASYNC_CANCEL + IORING_OP_LINK_TIMEOUT + IORING_OP_CONNECT + IORING_OP_FALLOCATE + IORING_OP_OPENAT + IORING_OP_CLOSE + IORING_OP_FILES_UPDATE + IORING_OP_STATX + IORING_OP_READ + IORING_OP_WRITE + IORING_OP_FADVISE + IORING_OP_MADVISE + IORING_OP_SEND + IORING_OP_RECV + IORING_OP_OPENAT2 + IORING_OP_EPOLL_CTL + IORING_OP_SPLICE + IORING_OP_PROVIDE_BUFFERS + IORING_OP_REMOVE_BUFFERS + IORING_OP_TEE + IORING_OP_SHUTDOWN + IORING_OP_RENAMEAT + IORING_OP_UNLINKAT + IORING_OP_MKDIRAT + IORING_OP_SYMLINKAT + IORING_OP_LINKAT + IORING_OP_MSG_RING + IORING_OP_FSETXATTR + IORING_OP_SETXATTR + IORING_OP_FGETXATTR + IORING_OP_GETXATTR + IORING_OP_SOCKET + IORING_OP_URING_CMD + + /* this goes last, obviously */ + IORING_OP_LAST +) + +/* + * sqe->fsync_flags + */ +const IORING_FSYNC_DATASYNC = (1 << 0) + +/* + * sqe->timeout_flags + */ +const ( + IORING_TIMEOUT_ABS = (1 << 0) + IORING_TIMEOUT_UPDATE = (1 << 1) + IORING_TIMEOUT_BOOTTIME = (1 << 2) + IORING_TIMEOUT_REALTIME = (1 << 3) + IORING_LINK_TIMEOUT_UPDATE = (1 << 4) + IORING_TIMEOUT_ETIME_SUCCESS = (1 << 5) + IORING_TIMEOUT_CLOCK_MASK = (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) + IORING_TIMEOUT_UPDATE_MASK = (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) +) + +/* + * sqe->splice_flags + * extends splice(2) flags + */ +const SPLICE_F_FD_IN_FIXED = (1 << 31) /* the last bit of __u32 */ + +/* + * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the + * command flags for POLL_ADD are stored in sqe->len. + * + * IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if + * the poll handler will continue to report + * CQEs on behalf of the same SQE. + * + * IORING_POLL_UPDATE Update existing poll request, matching + * sqe->addr as the old user_data field. + */ +const ( + IORING_POLL_ADD_MULTI = (1 << 0) + IORING_POLL_UPDATE_EVENTS = (1 << 1) + IORING_POLL_UPDATE_USER_DATA = (1 << 2) +) + +/* + * ASYNC_CANCEL flags. + * + * IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key + * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the + * request 'user_data' + * IORING_ASYNC_CANCEL_ANY Match any request + */ +const ( + IORING_ASYNC_CANCEL_ALL = (1 << 0) + IORING_ASYNC_CANCEL_FD = (1 << 1) + IORING_ASYNC_CANCEL_ANY = (1 << 2) +) + +/* + * send/sendmsg and recv/recvmsg flags (sqe->addr2) + * + * IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send + * or receive and arm poll if that yields an + * -EAGAIN result, arm poll upfront and skip + * the initial transfer attempt. 
+ */ +const IORING_RECVSEND_POLL_FIRST = (1 << 0) + +/* + * accept flags stored in sqe->ioprio + */ +const IORING_ACCEPT_MULTISHOT = (1 << 0) + +/* + * IO completion data structure (Completion Queue Entry) + */ +type IoUringCqe struct { + UserData uint64 /* sqe->data submission passed back */ + Res int32 /* result code for this event */ + Flags uint32 + + /* + * If the ring is initialized with IORING_SETUP_CQE32, then this field + * contains 16-bytes of padding, doubling the size of the CQE. + */ + // __u64 big_cqe[]; + BigCqe *uint64 // ptr arith +} + +/* + * cqe->flags + * + * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID + * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries + * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv + */ + +const ( + IORING_CQE_F_BUFFER = (1 << 0) + IORING_CQE_F_MORE = (1 << 1) + IORING_CQE_F_SOCK_NONEMPTY = (1 << 2) +) + +const ( + IORING_CQE_BUFFER_SHIFT = 16 +) + +/* + * Magic offsets for the application to mmap the data it needs + */ +const ( + IORING_OFF_SQ_RING = 0 + IORING_OFF_CQ_RING = 0x8000000 + IORING_OFF_SQES = 0x10000000 +) + +/* + * Filled with the offset for mmap(2) + */ + +type IoSqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Flags uint32 + Dropped uint32 + Array uint32 + resv1 uint32 + resv2 uint64 +} + +/* + * sq_ring->flags + */ +const ( + IORING_SQ_NEED_WAKEUP = (1 << 0) /* needs io_uring_enter wakeup */ + IORING_SQ_CQ_OVERFLOW = (1 << 1) /* CQ ring is overflown */ + IORING_SQ_TASKRUN = (1 << 2) /* task should enter the kernel */ +) + +type IoCqringOffsets struct { + Head uint32 + Tail uint32 + RingMask uint32 + RingEntries uint32 + Overflow uint32 + Cqes uint32 + Flags uint32 + resv1 uint32 + resv2 uint64 +} + +/* + * cq_ring->flags + */ + +/* disable eventfd notifications */ +const IORING_CQ_EVENTFD_DISABLED = (1 << 0) + +/* + * io_uring_enter(2) flags + */ +const ( + IORING_ENTER_GETEVENTS = (1 << 0) + IORING_ENTER_SQ_WAKEUP = (1 << 1) + IORING_ENTER_SQ_WAIT = (1 << 2) + IORING_ENTER_EXT_ARG = (1 << 3) + IORING_ENTER_REGISTERED_RING = (1 << 4) +) + +/* + * Passed in for io_uring_setup(2). 
Copied back with updated info on success + */ + +type IoUringParams struct { + SqEntries uint32 + CqEntries uint32 + Flags uint32 + SqThreadCpu uint32 + SqThreadIdle uint32 + Features uint32 + WqFd uint32 + resv [3]uint32 + SqOff IoSqringOffsets + CqOff IoCqringOffsets +} + +/* + * io_uring_params->features flags + */ +const ( + IORING_FEAT_SINGLE_MMAP = (1 << 0) + IORING_FEAT_NODROP = (1 << 1) + IORING_FEAT_SUBMIT_STABLE = (1 << 2) + IORING_FEAT_RW_CUR_POS = (1 << 3) + IORING_FEAT_CUR_PERSONALITY = (1 << 4) + IORING_FEAT_FAST_POLL = (1 << 5) + IORING_FEAT_POLL_32BITS = (1 << 6) + IORING_FEAT_SQPOLL_NONFIXED = (1 << 7) + IORING_FEAT_EXT_ARG = (1 << 8) + IORING_FEAT_NATIVE_WORKERS = (1 << 9) + IORING_FEAT_RSRC_TAGS = (1 << 10) + IORING_FEAT_CQE_SKIP = (1 << 11) + IORING_FEAT_LINKED_FILE = (1 << 12) +) + +/* + * io_uring_register(2) opcodes and arguments + */ +const ( + IORING_REGISTER_BUFFERS = 0 + IORING_UNREGISTER_BUFFERS = 1 + IORING_REGISTER_FILES = 2 + IORING_UNREGISTER_FILES = 3 + IORING_REGISTER_EVENTFD = 4 + IORING_UNREGISTER_EVENTFD = 5 + IORING_REGISTER_FILES_UPDATE = 6 + IORING_REGISTER_EVENTFD_ASYNC = 7 + IORING_REGISTER_PROBE = 8 + IORING_REGISTER_PERSONALITY = 9 + IORING_UNREGISTER_PERSONALITY = 10 + IORING_REGISTER_RESTRICTIONS = 11 + IORING_REGISTER_ENABLE_RINGS = 12 + + /* extended with tagging */ + IORING_REGISTER_FILES2 = 13 + IORING_REGISTER_FILES_UPDATE2 = 14 + IORING_REGISTER_BUFFERS2 = 15 + IORING_REGISTER_BUFFERS_UPDATE = 16 + + /* set/clear io-wq thread affinities */ + IORING_REGISTER_IOWQ_AFF = 17 + IORING_UNREGISTER_IOWQ_AFF = 18 + + /* set/get max number of io-wq workers */ + IORING_REGISTER_IOWQ_MAX_WORKERS = 19 + + /* register/unregister io_uring fd with the ring */ + IORING_REGISTER_RING_FDS = 20 + IORING_UNREGISTER_RING_FDS = 21 + + /* register ring based provide buffer group */ + IORING_REGISTER_PBUF_RING = 22 + IORING_UNREGISTER_PBUF_RING = 23 + + /* this goes last */ + IORING_REGISTER_LAST +) + +/* io-wq worker categories */ +const ( + IO_WQ_BOUND = iota + IO_WQ_UNBOUND +) + +/* deprecated, see struct IoUringRsrcUpdate */ +type IoUringFilesUpdate struct { + Offset uint32 + resv uint32 + Fds uint64 // __aligned_u64/* __s32 * */ +} + +/* + * Register a fully sparse file space, rather than pass in an array of all + * -1 file descriptors. 
+ */ +const IORING_RSRC_REGISTER_SPARSE = (1 << 0) + +type IoUringRsrcRegister struct { + Nr uint32 + Flags uint32 + resv2 uint64 + Data uint64 // __aligned_u64 + Tags uint64 // __aligned_u64 +} + +type IoUringRsrcUpdate struct { + Offset uint32 + resv uint32 + Data uint64 // __aligned_u64 +} + +type IoUringRsrcUpdate2 struct { + Offset uint32 + resv uint32 + Data uint64 // __aligned_u64 + Tags uint64 // __aligned_u64 + Nr uint32 + resv2 uint32 +} + +/* Skip updating fd indexes set to this value in the fd table */ +const IORING_REGISTER_FILES_SKIP = (-2) + +const IO_URING_OP_SUPPORTED = (1 << 0) + +type IoUringProbeOp struct { + op uint8 + resv uint8 + flags uint16 /* IO_URING_OP_* flags */ + resv2 uint32 +} + +type IoUringProbe struct { + last_op uint8 /* last opcode supported */ + uint8 /* length of ops[] array below */ + resv uint16 + resv2 [3]uint32 + ops [0]IoUringProbeOp +} + +type IoUringRestriction struct { + opcode uint16 + // union { + // __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */ + // __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */ + // __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */ + // }; + Union1 uint8 + resv uint8 + resv2 [3]uint32 +} + +type IoUringBuf struct { + Addr uint64 + Len uint32 + Bid uint16 + resv uint16 +} + +type IoUringBufRing struct { + // union { + /* + * To avoid spilling into more pages than we need to, the + * ring tail is overlaid with the IoUringBuf->resv field. + */ + Anon0 struct { + resv1 uint64 + resv2 uint32 + resv3 uint16 + Tail uint16 + } + // bufs [0]IoUringBuf + // }; +} + +/* argument for IORING_(UN)REGISTER_PBUF_RING */ +type IoUringBufReg struct { + RingAddr uint64 + RingEntries uint32 + Bgid uint16 + Pad uint16 + resv [3]uint64 +} + +/* + * IoUringRestriction->opcode values + */ +const ( + /* Allow an io_uring_register(2) opcode */ + IORING_RESTRICTION_REGISTER_OP = 0 + + /* Allow an sqe opcode */ + IORING_RESTRICTION_SQE_OP = 1 + + /* Allow sqe flags */ + IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2 + + /* Require sqe flags (these flags must be set on each submission) */ + IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3 + + IORING_RESTRICTION_LAST +) + +type IoUringGeteventsArg struct { + Sigmask uint64 + SigmaskSz uint32 + Pad uint32 + Ts uint64 +} + +/* + * accept flags stored in sqe->ioprio + */ +// const IORING_ACCEPT_MULTISHOT = (1 << 0) diff --git a/hdr_int_flags.go b/hdr_int_flags.go new file mode 100644 index 0000000..5f1b581 --- /dev/null +++ b/hdr_int_flags.go @@ -0,0 +1,3 @@ +package gouring + +const INT_FLAG_REG_RING uint8 = 1 diff --git a/hdr_struct.go b/hdr_struct.go new file mode 100644 index 0000000..7eaafcd --- /dev/null +++ b/hdr_struct.go @@ -0,0 +1,60 @@ +package gouring + +import "unsafe" + +var ( + SizeofUnsigned = unsafe.Sizeof(uint32(0)) + SizeofUint32 = unsafe.Sizeof(uint32(0)) + SizeofIoUringSqe = unsafe.Sizeof(IoUringSqe{}) + SizeofIoUringCqe = unsafe.Sizeof(IoUringCqe{}) +) + +type IoUring struct { + Sq IoUringSq + Cq IoUringCq + Flags uint32 + RingFd int32 + + Features uint32 + EnterRingFd int32 + IntFlags uint8 + + pad [3]uint8 + pad2 uint32 +} + +type IoUringSq struct { + Head *uint32 + Tail *uint32 + RingMask *uint32 + RingEntries *uint32 + Flags *uint32 + Dropped *uint32 + + Array uint32Array //ptr arith + Sqes ioUringSqeArray //ptr arith + + SqeHead uint32 + SqeTail uint32 + + RingSz uint32 + RingPtr uintptr + + pad [4]uint32 +} + +type IoUringCq struct { + Head *uint32 + Tail *uint32 + RingMask *uint32 + RingEntries *uint32 + Flags *uint32 + Overflow *uint32 + + Cqes ioUringCqeArray //ptr arith + + 
RingSz uint32 + RingPtr uintptr + + pad [4]uint32 +} diff --git a/prep.go b/prep.go new file mode 100644 index 0000000..c75b967 --- /dev/null +++ b/prep.go @@ -0,0 +1,62 @@ +package gouring + +import ( + "syscall" + "unsafe" +) + +func PrepRW(op IoUringOp, sqe *IoUringSqe, fd int, + addr unsafe.Pointer, len int, offset uint64) { + sqe.Opcode = op + sqe.Flags = 0 + sqe.IoPrio = 0 + sqe.Fd = int32(fd) + sqe.SetOffset(offset) + sqe.SetAddr(uint64(uintptr(addr))) + sqe.Len = uint32(len) + sqe.SetOpFlags(0) + sqe.SetBufIndex(0) + sqe.Personality = 0 + sqe.SetFileIndex(0) + sqe.Addr3 = 0 + sqe.__pad2[0] = 0 +} + +func PrepNop(sqe *IoUringSqe) { + PrepRW(IORING_OP_NOP, sqe, -1, nil, 0, 0) +} + +func PrepTimeout(sqe *IoUringSqe, ts *syscall.Timespec, count uint32, flags uint32) { + PrepRW(IORING_OP_TIMEOUT, sqe, -1, unsafe.Pointer(ts), 1, uint64(count)) + sqe.SetTimeoutFlags(flags) +} + +func PrepReadv(sqe *IoUringSqe, fd int, + iov *syscall.Iovec, nrVecs int, + offset uint64) { + PrepRW(IORING_OP_READV, sqe, fd, unsafe.Pointer(iov), nrVecs, offset) +} +func PrepReadv2(sqe *IoUringSqe, fd int, + iov *syscall.Iovec, nrVecs int, + offset uint64, flags uint32) { + PrepReadv(sqe, fd, iov, nrVecs, offset) + sqe.SetRwFlags(flags) +} + +func PrepWritev(sqe *IoUringSqe, fd int, + iov *syscall.Iovec, nrVecs int, + offset uint64) { + PrepRW(IORING_OP_WRITEV, sqe, fd, unsafe.Pointer(iov), nrVecs, offset) +} +func PrepWritev2(sqe *IoUringSqe, fd int, + iov *syscall.Iovec, nrVecs int, + offset uint64, flags uint32) { + PrepWritev(sqe, fd, iov, nrVecs, offset) + sqe.SetRwFlags(flags) +} + +func PrepAccept(sqe *IoUringSqe, fd int, rsa *syscall.RawSockaddrAny, rsaSz *uintptr, flags int) { + *rsaSz = syscall.SizeofSockaddrAny + PrepRW(IORING_OP_ACCEPT, sqe, fd, unsafe.Pointer(rsa), 0, uint64(uintptr(unsafe.Pointer(rsaSz)))) + sqe.SetAcceptFlags(uint32(flags)) +} diff --git a/queue.go b/queue.go new file mode 100644 index 0000000..9a1ce55 --- /dev/null +++ b/queue.go @@ -0,0 +1,428 @@ +package gouring + +import ( + "sync/atomic" + "syscall" + "unsafe" +) + +const LIBURING_UDATA_TIMEOUT uint64 = ^uint64(0) + +/* + * Returns true if we're not using SQ thread (thus nobody submits but us) + * or if IORING_SQ_NEED_WAKEUP is set, so submit thread must be explicitly + * awakened. For the latter case, we set the thread wakeup flag. 
+ */ +func (ring *IoUring) sq_ring_needs_enter(flags *uint32) bool { + if ring.Flags&IORING_SETUP_SQPOLL == 0 { + return true + } + + // FIXME: io_uring_smp_mb + + if atomic.LoadUint32(ring.Sq.Flags)&IORING_SQ_NEED_WAKEUP != 0 { + *flags |= IORING_ENTER_SQ_WAKEUP + return true + } + return false +} + +func (ring *IoUring) cq_ring_needs_flush() bool { + return atomic.LoadUint32(ring.Sq.Flags)&(IORING_SQ_CQ_OVERFLOW|IORING_SQ_TASKRUN) != 0 +} + +func (ring *IoUring) cq_ring_needs_enter() bool { + return (ring.Flags&IORING_SETUP_IOPOLL != 0) || ring.cq_ring_needs_flush() +} + +type get_data struct { + submit uint32 + waitNr uint32 + getFlags uint32 + sz int32 + arg unsafe.Pointer +} + +func (ring *IoUring) _io_uring_get_cqe(cqePtr **IoUringCqe, data *get_data) (err error) { + var cqe *IoUringCqe + var looped = false + var ret int + for { + var needEnter = false + var flags uint32 = 0 + var nrAvail uint32 = 0 + err = ring.__io_uring_peek_cqe(&cqe, &nrAvail) + if err != nil { + break + } + if cqe != nil && data.waitNr == 0 && data.submit == 0 { + if looped || !ring.cq_ring_needs_enter() { + err = syscall.EAGAIN + break + } + needEnter = true + } + if data.waitNr > nrAvail || needEnter { + flags = IORING_ENTER_GETEVENTS | data.getFlags + needEnter = true + } + if data.submit > 0 && ring.sq_ring_needs_enter(&flags) { + needEnter = true + } + if !needEnter { + break + } + + if ring.IntFlags&INT_FLAG_REG_RING != 0 { + flags |= IORING_ENTER_REGISTERED_RING + } + ret, err = io_uring_enter2(ring.EnterRingFd, data.submit, data.waitNr, flags, (*Sigset_t)(data.arg), data.sz) + if err != nil { + break + } + data.submit = data.submit - uint32(ret) + if cqe != nil { + break + } + looped = true + } + + *cqePtr = cqe + return +} + +func (ring *IoUring) __io_uring_get_cqe(cqePtr **IoUringCqe, submit uint32, waitNr uint32, sigmask *Sigset_t) error { + data := &get_data{ + submit: submit, + waitNr: waitNr, + getFlags: 0, + sz: NSIG / 8, + arg: unsafe.Pointer(sigmask), + } + return ring._io_uring_get_cqe(cqePtr, data) +} + +/* + * Fill in an array of IO completions up to count, if any are available. + * Returns the amount of IO completions filled. + */ +func (ring *IoUring) io_uring_peek_batch_cqe(cqes []*IoUringCqe, count uint32) uint32 { + var ready uint32 + var overflowChecked = false + var shift = 0 + if ring.Flags&IORING_SETUP_CQE32 != 0 { + shift = 1 + } + +again: + ready = ring.io_uring_cq_ready() + if ready > 0 { + var head = *ring.Cq.Head + var mask = *ring.Cq.RingMask + var last uint32 + if count > ready { + count = ready + } + last = head + count + var i uintptr + for i = 0; head != last; i++ { + cqes[i] = ioUringCqeArray_Index(ring.Cq.Cqes, uintptr((head&mask)< 0; toSubmit-- { + *uint32Array_Index(sq.Array, uintptr(ktail&mask)) = sq.SqeHead & mask + ktail++ + sq.SqeHead++ + } + + /* + * Ensure that the kernel sees the SQE updates before it sees the tail + * update. + */ + atomic.StoreUint32(sq.Tail, ktail) + +out: + /* + * This _may_ look problematic, as we're not supposed to be reading + * SQ->head without acquire semantics. When we're in SQPOLL mode, the + * kernel submitter could be updating this right now. For non-SQPOLL, + * task itself does it, and there's no potential race. But even for + * SQPOLL, the load is going to be potentially out-of-date the very + * instant it's done, regardless or whether or not it's done + * atomically. Worst case, we're going to be over-estimating what + * we can submit. 
The point is, we need to be able to deal with this + * situation regardless of any perceived atomicity. + */ + return ktail - *sq.Head +} + +/* + * If we have kernel support for IORING_ENTER_EXT_ARG, then we can use that + * more efficiently than queueing an internal timeout command. + */ +func (ring *IoUring) io_uring_wait_cqes_new(cqePtr **IoUringCqe, waitNtr uint32, ts *syscall.Timespec, sigmask *Sigset_t) error { + arg := &IoUringGeteventsArg{ + Sigmask: uint64(uintptr(unsafe.Pointer(sigmask))), + SigmaskSz: NSIG / 8, + Ts: uint64(uintptr(unsafe.Pointer(ts))), + } + data := &get_data{ + waitNr: waitNtr, + getFlags: IORING_ENTER_EXT_ARG, + sz: int32(unsafe.Sizeof(arg)), + } + return ring._io_uring_get_cqe(cqePtr, data) +} + +/* + * Like io_uring_wait_cqe(), except it accepts a timeout value as well. Note + * that an sqe is used internally to handle the timeout. For kernel doesn't + * support IORING_FEAT_EXT_ARG, applications using this function must never + * set sqe->user_data to LIBURING_UDATA_TIMEOUT! + * + * For kernels without IORING_FEAT_EXT_ARG (5.10 and older), if 'ts' is + * specified, the application need not call io_uring_submit() before + * calling this function, as we will do that on its behalf. From this it also + * follows that this function isn't safe to use for applications that split SQ + * and CQ handling between two threads and expect that to work without + * synchronization, as this function manipulates both the SQ and CQ side. + * + * For kernels with IORING_FEAT_EXT_ARG, no implicit submission is done and + * hence this function is safe to use for applications that split SQ and CQ + * handling between two threads. + */ +func (ring *IoUring) __io_uring_submit_timeout(waitNr uint32, ts *syscall.Timespec) (ret int, err error) { + sqe := ring.io_uring_get_sqe() + if sqe == nil { + ret, err = ring.io_uringn_submit() + if err != nil { + return + } + sqe = ring.io_uring_get_sqe() + if sqe == nil { + err = syscall.EAGAIN + return + } + } + + PrepTimeout(sqe, ts, waitNr, 0) + sqe.UserData = LIBURING_UDATA_TIMEOUT + ret = int(ring.__io_uring_flush_sq()) + return +} + +func (ring *IoUring) io_uring_wait_cqes(cqePtr **IoUringCqe, waitNtr uint32, ts *syscall.Timespec, sigmask *Sigset_t) (err error) { + var toSubmit = 0 + if ts != nil { + if ring.Features&IORING_FEAT_EXT_ARG != 0 { + err = ring.io_uring_wait_cqes_new(cqePtr, waitNtr, ts, sigmask) + return + } + toSubmit, err = ring.__io_uring_submit_timeout(waitNtr, ts) + if err != nil { + return + } + } + err = ring.__io_uring_get_cqe(cqePtr, uint32(toSubmit), waitNtr, sigmask) + return +} + +func (ring *IoUring) io_uring_submit_and_wait_timeout(cqePtr **IoUringCqe, waitNtr uint32, ts *syscall.Timespec, sigmask *Sigset_t) (err error) { + var toSubmit int + if ts != nil { + if ring.Features&IORING_FEAT_EXT_ARG != 0 { + arg := IoUringGeteventsArg{ + Sigmask: uint64(uintptr(unsafe.Pointer(sigmask))), + SigmaskSz: NSIG / 8, + Ts: uint64(uintptr(unsafe.Pointer(ts))), + } + data := &get_data{ + submit: ring.__io_uring_flush_sq(), + waitNr: waitNtr, + getFlags: IORING_ENTER_EXT_ARG, + sz: int32(unsafe.Sizeof(arg)), + arg: unsafe.Pointer(&arg), + } + return ring._io_uring_get_cqe(cqePtr, data) + } + toSubmit, err = ring.__io_uring_submit_timeout(waitNtr, ts) + if err != nil { + return + } + } else { + toSubmit = int(ring.__io_uring_flush_sq()) + } + err = ring.__io_uring_get_cqe(cqePtr, uint32(toSubmit), waitNtr, sigmask) + return +} + +/* + * See io_uring_wait_cqes() - this function is the same, it just always uses + * '1' as the 
wait_nr. + */ +func (ring *IoUring) io_uring_wait_cqe_timeout(cqePtr **IoUringCqe, ts *syscall.Timespec) error { + return ring.io_uring_wait_cqes(cqePtr, 1, ts, nil) +} + +/* + * Submit sqes acquired from io_uring_get_sqe() to the kernel. + * + * Returns number of sqes submitted + */ +func (ring *IoUring) io_uringn_submit() (int, error) { + return ring.__io_uring_submit_and_wait(0) +} + +/* + * Like io_uring_submit(), but allows waiting for events as well. + * + * Returns number of sqes submitted + */ +func (ring *IoUring) io_uring_submit_and_wait(waitNtr uint32) (int, error) { + return ring.__io_uring_submit_and_wait(waitNtr) +} + +func (ring *IoUring) __io_uring_submit_and_wait(waitNr uint32) (int, error) { + return ring.__io_uring_submit(ring.__io_uring_flush_sq(), waitNr) +} + +func (ring *IoUring) __io_uring_submit(submitted uint32, waitNr uint32) (ret int, err error) { + var flags uint32 = 0 + + if ring.sq_ring_needs_enter(&flags) || waitNr != 0 { + if waitNr != 0 || ring.Flags&IORING_SETUP_IOPOLL != 0 { + flags |= IORING_ENTER_GETEVENTS + } + if ring.IntFlags&INT_FLAG_REG_RING != 0 { + flags |= IORING_ENTER_REGISTERED_RING + } + ret, err = io_uring_enter(ring.EnterRingFd, submitted, waitNr, flags, nil) + } else { + ret = int(submitted) + } + return +} + +func (ring *IoUring) io_uring_get_sqe() *IoUringSqe { + return ring._io_uring_get_sqe() +} + +/* + * Return an sqe to fill. Application must later call io_uring_submit() + * when it's ready to tell the kernel about it. The caller may call this + * function multiple times before calling io_uring_submit(). + * + * Returns a vacant sqe, or NULL if we're full. + */ +func (ring *IoUring) _io_uring_get_sqe() (sqe *IoUringSqe) { + sq := &ring.Sq + var head = atomic.LoadUint32(sq.Head) + var next = sq.SqeTail + 1 + var shift uint32 = 0 + + if ring.Flags&IORING_SETUP_SQE128 != 0 { + shift = 1 + } + + if next-head <= *sq.RingEntries { + sqe = ioUringSqeArray_Index(sq.Sqes, uintptr(sq.SqeTail&*sq.RingMask< 0 { + atomic.StoreUint32(ring.Cq.Head, *ring.Cq.Head+nr) + } +} diff --git a/queue_test.go b/queue_test.go new file mode 100644 index 0000000..d737722 --- /dev/null +++ b/queue_test.go @@ -0,0 +1,18 @@ +package gouring + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestRingQueue(t *testing.T) { + h, err := New(256, 0) + assert.NoError(t, err) + defer h.Close() + + sqe := h.io_uring_get_sqe() + fmt.Printf("%+#v\n", sqe) + +} diff --git a/register.go b/register.go new file mode 100644 index 0000000..5b36c16 --- /dev/null +++ b/register.go @@ -0,0 +1,276 @@ +package gouring + +import ( + "syscall" + "unsafe" +) + +func (ring *IoUring) io_uring_register_buffers_update_tag(off uint32, + iov *syscall.Iovec, + tags []uint64, + nr uint32) error { + up := &IoUringRsrcUpdate2{ + Offset: off, + Data: uint64(uintptr(unsafe.Pointer(iov))), + Tags: uint64(uintptr(unsafe.Pointer(&tags[0]))), + Nr: nr, + } + + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_BUFFERS_UPDATE, + unsafe.Pointer(up), unsafe.Sizeof(*up)) + if err != nil { + return err + } + _ = ret + return nil +} + +func (ring *IoUring) io_uring_register_buffers_tags( + iov *syscall.Iovec, + tags []uint64, + nr uint32) error { + reg := &IoUringRsrcRegister{ + Nr: nr, + Data: uint64(uintptr(unsafe.Pointer(iov))), + Tags: uint64(uintptr(unsafe.Pointer(&tags[0]))), + } + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_BUFFERS2, + unsafe.Pointer(reg), unsafe.Sizeof(*reg)) + if err != nil { + return err + } + _ = ret + return nil +} + +func 
(ring *IoUring) io_uring_register_buffers_sparse(nr uint32) error { + reg := &IoUringRsrcRegister{ + Flags: IORING_RSRC_REGISTER_SPARSE, + Nr: nr, + } + ret, err := io_uring_register(ring.RingFd, IORING_RSRC_REGISTER_SPARSE, + unsafe.Pointer(reg), unsafe.Sizeof(*reg)) + if err != nil { + return err + } + _ = ret + return nil +} + +func (ring *IoUring) io_uring_register_buffers(iov *syscall.Iovec, nrIov uint32) int { + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_BUFFERS, + unsafe.Pointer(iov), uintptr(nrIov)) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_unregister_buffers() int { + ret, err := io_uring_register(ring.RingFd, IORING_UNREGISTER_BUFFERS, nil, 0) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_register_files_update_tag(off uint32, + files []int, tags []uint64, + nrFiles uint32) (int, error) { + up := &IoUringRsrcUpdate2{ + Offset: off, + Data: uint64(uintptr(unsafe.Pointer(&files[0]))), + Tags: uint64(uintptr(unsafe.Pointer(&tags[0]))), + Nr: nrFiles, + } + return io_uring_register(ring.RingFd, IORING_REGISTER_FILES_UPDATE2, + unsafe.Pointer(up), + unsafe.Sizeof(*up)) +} + +func (ring *IoUring) io_uring_register_files_update(off uint32, + files []int, nrFiles uint32) (int, error) { + up := &IoUringFilesUpdate{ + Offset: off, + Fds: uint64(uintptr(unsafe.Pointer(&files[0]))), + } + return io_uring_register(ring.RingFd, IORING_REGISTER_FILES_UPDATE, + unsafe.Pointer(up), uintptr(nrFiles)) +} + +func (ring *IoUring) io_uring_register_files_sparse(nr uint32) (ret int, err error) { + reg := &IoUringRsrcRegister{ + Flags: IORING_RSRC_REGISTER_SPARSE, + Nr: nr, + } + var didIncrease bool + for { + ret, err = io_uring_register(ring.RingFd, IORING_REGISTER_FILES2, + unsafe.Pointer(reg), + unsafe.Sizeof(*reg)) + if err == nil { + break + } + if err == syscall.EMFILE && !didIncrease { + increase_rlimit_nofile(uint64(nr)) + didIncrease = true + continue + } + break + } + return +} + +func (ring *IoUring) io_uring_register_files_tags( + files []int, + tags []uint64, nr uint32) (ret int, err error) { + reg := &IoUringRsrcRegister{ + Nr: nr, + Data: uint64(uintptr(unsafe.Pointer(&files[0]))), + Tags: uint64(uintptr(unsafe.Pointer(&tags[0]))), + } + var didIncrease bool + for { + ret, err = io_uring_register(ring.RingFd, IORING_REGISTER_FILES2, + unsafe.Pointer(reg), unsafe.Sizeof(*reg)) + if err == nil { + break + } + if err == syscall.EMFILE && !didIncrease { + increase_rlimit_nofile(uint64(nr)) + didIncrease = true + continue + } + break + } + return +} + +func (ring *IoUring) io_uring_register_files( + files []int, nrFiles uint32) (ret int, err error) { + var didIncrease bool + for { + ret, err = io_uring_register(ring.RingFd, IORING_REGISTER_FILES, + unsafe.Pointer(&files[0]), uintptr(nrFiles)) + if err == nil { + break + } + if err == syscall.EMFILE && !didIncrease { + increase_rlimit_nofile(uint64(nrFiles)) + didIncrease = true + continue + } + break + } + return +} + +func (ring *IoUring) io_uring_unregister_files() int { + ret, err := io_uring_register(ring.RingFd, IORING_UNREGISTER_FILES, nil, 0) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_unregister_eventfd() int { + ret, err := io_uring_register(ring.RingFd, IORING_UNREGISTER_EVENTFD, nil, 0) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_register_eventfd_async(eventFd int) int { + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_EVENTFD_ASYNC, nil, 0) + if err != 
nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_register_probe(p *IoUringProbe, nrOps uint32) int { + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_PROBE, + unsafe.Pointer(p), uintptr(nrOps)) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_register_personality() (int, error) { + return io_uring_register(ring.RingFd, IORING_REGISTER_PERSONALITY, nil, 0) +} + +func (ring *IoUring) io_uring_unregister_personality(id int32) (int, error) { + return io_uring_register(ring.RingFd, IORING_UNREGISTER_PERSONALITY, nil, uintptr(id)) +} + +func (ring *IoUring) io_uring_register_restrictions(res *IoUringRestriction, nrRes uint32) int { + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_RESTRICTIONS, + unsafe.Pointer(res), uintptr(nrRes)) + if err != nil { + return 0 + } + return ret +} + +func (ring *IoUring) io_uring_enable_rings() error { + _, err := io_uring_register(ring.RingFd, IORING_REGISTER_ENABLE_RINGS, nil, 0) + return err +} + +// sched.h +// func io_uring_register_iowq_aff(ring *IoUring, cpuSz int, mask *CpuSet) { +// } +func (ring *IoUring) io_uring_unregister_iowq_aff() error { + _, err := io_uring_register(ring.RingFd, IORING_UNREGISTER_IOWQ_AFF, nil, 0) + return err +} + +func (ring *IoUring) io_uring_register_iowq_max_workers(val *uint32) (int, error) { + return io_uring_register(ring.RingFd, IORING_REGISTER_IOWQ_MAX_WORKERS, + unsafe.Pointer(val), 2) +} + +func (ring *IoUring) io_uring_register_ring_fd() (int, error) { + up := &IoUringRsrcUpdate{ + Data: uint64(ring.RingFd), + Offset: ^uint32(0), + } + ret, err := io_uring_register(ring.RingFd, IORING_REGISTER_RING_FDS, + unsafe.Pointer(up), 1) + if err != nil { + return 0, err + } + ring.EnterRingFd = int32(up.Offset) + ring.IntFlags |= INT_FLAG_REG_RING + return ret, nil +} + +func (ring *IoUring) io_uring_unregister_ring_fd() error { + up := &IoUringRsrcUpdate{ + Offset: uint32(ring.EnterRingFd), + } + ret, err := io_uring_register(ring.RingFd, IORING_UNREGISTER_RING_FDS, + unsafe.Pointer(up), 1) + if err != nil { + return err + } + if ret == 1 { + ring.EnterRingFd = ring.RingFd + ring.IntFlags &= ^INT_FLAG_REG_RING + } + return nil +} + +func (ring *IoUring) io_uring_register_buf_ring(reg *IoUringBufReg, flags uint32) (int, error) { + return io_uring_register(ring.RingFd, IORING_REGISTER_PBUF_RING, unsafe.Pointer(reg), 1) +} + +func (ring *IoUring) io_uring_unregister_buf_ring(bgId int32) (int, error) { + reg := &IoUringBufReg{ + Bgid: uint16(bgId), + } + return io_uring_register(ring.RingFd, IORING_UNREGISTER_PBUF_RING, unsafe.Pointer(reg), 1) +} diff --git a/setup.go b/setup.go new file mode 100644 index 0000000..ee5b76f --- /dev/null +++ b/setup.go @@ -0,0 +1,142 @@ +package gouring + +import ( + "syscall" + "unsafe" +) + +func io_uring_queue_init(entries uint32, ring *IoUring, flags uint32) error { + p := new(IoUringParams) + p.Flags = flags + return io_uring_queue_init_params(entries, ring, p) +} + +func io_uring_queue_init_params(entries uint32, ring *IoUring, p *IoUringParams) error { + fd, err := io_uring_setup(entries, p) + if err != nil { + return err + } + err = io_uring_queue_mmap(fd, p, ring) + if err != nil { + return err + } + ring.Features = p.Features + return nil +} + +func (ring *IoUring) io_uring_queue_exit() { + sq := &ring.Sq + cq := &ring.Cq + sqeSize := SizeofIoUringSqe + if ring.Flags&IORING_SETUP_SQE128 != 0 { + sqeSize += 64 + } + munmap(uintptr(unsafe.Pointer(sq.Sqes)), sqeSize*uintptr(*sq.RingEntries)) + 
io_uring_unmap_rings(sq, cq) + /* + * Not strictly required, but frees up the slot we used now rather + * than at process exit time. + */ + if ring.IntFlags&INT_FLAG_REG_RING != 0 { + ring.io_uring_unregister_ring_fd() + } + syscall.Close(int(ring.RingFd)) +} + +func io_uring_queue_mmap(fd int, p *IoUringParams, ring *IoUring) error { + err := io_uring_mmap(fd, p, &ring.Sq, &ring.Cq) + if err != nil { + ring.Flags = p.Flags + ring.RingFd = ring.EnterRingFd + ring.IntFlags = 0 + return err + } + return nil +} + +func io_uring_mmap(fd int, p *IoUringParams, sq *IoUringSq, cq *IoUringCq) (err error) { + + size := SizeofIoUringCqe + if p.Flags&IORING_SETUP_CQE32 != 0 { + size += SizeofIoUringCqe + } + + sq.RingSz = p.SqOff.Array + p.SqEntries*uint32(SizeofUnsigned) + cq.RingSz = p.CqOff.Cqes + p.CqEntries*uint32(size) + + if p.Features&IORING_FEAT_SINGLE_MMAP != 0 { + if cq.RingSz > sq.RingSz { + sq.RingSz = cq.RingSz + } + // cq.RingSz = sq.RingSz + } + // alloc sq ring + sq.RingPtr, err = mmap(0, uintptr(sq.RingSz), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED|syscall.MAP_POPULATE, + fd, IORING_OFF_SQ_RING) + if err != nil { + return + } + + if p.Features&IORING_FEAT_SINGLE_MMAP != 0 { + cq.RingPtr = sq.RingPtr + } else { + // alloc cq ring + cq.RingPtr, err = mmap(0, uintptr(cq.RingSz), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED|syscall.MAP_POPULATE, + fd, IORING_OFF_CQ_RING) + if err != nil { + // goto errLabel + io_uring_unmap_rings(sq, cq) + return + } + } + + //sq + sq.Head = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.Head))) + sq.Tail = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.Tail))) + sq.RingMask = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.RingMask))) + sq.RingEntries = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.RingEntries))) + sq.Flags = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.Flags))) + sq.Dropped = (*uint32)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.Dropped))) + sq.Array = (uint32Array)(unsafe.Pointer(sq.RingPtr + uintptr(p.SqOff.Array))) + + size = SizeofIoUringSqe + if p.Flags&IORING_SETUP_SQE128 != 0 { + size += 64 + } + var sqeAddr uintptr + sqeAddr, err = mmap(0, size*uintptr(p.SqEntries), + syscall.PROT_READ|syscall.PROT_WRITE, + syscall.MAP_SHARED|syscall.MAP_POPULATE, + fd, IORING_OFF_SQES) + if err != nil { + //errLabel: + io_uring_unmap_rings(sq, cq) + return + } + sq.Sqes = (ioUringSqeArray)(unsafe.Pointer(sqeAddr)) + + //cq + cq.Head = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.Head))) + cq.Tail = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.Tail))) + cq.RingMask = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.RingMask))) + cq.RingEntries = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.RingEntries))) + cq.Overflow = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.Overflow))) + cq.Cqes = (ioUringCqeArray)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.Cqes))) + if p.CqOff.Flags != 0 { + cq.Flags = (*uint32)(unsafe.Pointer(cq.RingPtr + uintptr(p.CqOff.Flags))) + } + + return nil +} + +func io_uring_unmap_rings(sq *IoUringSq, cq *IoUringCq) error { + munmap(sq.RingPtr, uintptr(sq.RingSz)) + if cq.RingPtr != 0 && cq.RingPtr != sq.RingPtr { + munmap(cq.RingPtr, uintptr(cq.RingSz)) + } + return nil +} diff --git a/sigset.go b/sigset.go new file mode 100644 index 0000000..0ad7376 --- /dev/null +++ b/sigset.go @@ -0,0 +1,19 @@ +package gouring + +import ( + "unsafe" +) + +const ( + SizeofUint64 = unsafe.Sizeof(uint64(0)) + SIGSET_NWORDS = 
(1024 / (8 * SizeofUint64)) + SIGTMIN = 32 + SIGTMAX = SIGTMIN + NSIG = (SIGTMAX + 1) +) + +type Sigset_t struct { + Val [SIGSET_NWORDS]uint64 +} + +// https://baike.baidu.com/item/sigset_t/4481187 diff --git a/syscall.go b/syscall.go new file mode 100644 index 0000000..9617e10 --- /dev/null +++ b/syscall.go @@ -0,0 +1,69 @@ +package gouring + +import ( + "syscall" + "unsafe" +) + +const ( + // uring syscall no. + + SYS_IO_URING_SETUP = 425 + SYS_IO_URING_ENTER = 426 + SYS_IO_URING_REGISTER = 427 +) + +func io_uring_setup(entries uint32, params *IoUringParams) (ret int, err error) { + r1, _, e1 := syscall.RawSyscall(SYS_IO_URING_SETUP, uintptr(entries), uintptr(unsafe.Pointer(params)), 0) + ret = int(r1) + if e1 != 0 { + err = e1 + } + return +} + +func io_uring_enter(fd int32, toSubmit uint32, minComplete uint32, flags uint32, sig *Sigset_t) (ret int, err error) { + return io_uring_enter2(fd, toSubmit, minComplete, flags, sig, NSIG/8) +} + +func io_uring_enter2(fd int32, toSubmit uint32, minComplete uint32, flags uint32, sig *Sigset_t, sz int32) (ret int, err error) { + r1, _, e1 := syscall.RawSyscall6(SYS_IO_URING_ENTER, + uintptr(fd), + uintptr(toSubmit), uintptr(minComplete), + uintptr(flags), uintptr(unsafe.Pointer(sig)), uintptr(sz)) + ret = int(r1) + if e1 != 0 { + err = e1 + } + return +} + +func io_uring_register(fd int32, opcode uint32, arg unsafe.Pointer, nrArgs uintptr) (ret int, err error) { + r1, _, e1 := syscall.RawSyscall6(SYS_IO_URING_REGISTER, uintptr(fd), uintptr(opcode), uintptr(arg), uintptr(nrArgs), 0, 0) + ret = int(r1) + if e1 != 0 { + err = e1 + } + return +} + +//go:linkname mmap syscall.mmap +func mmap(addr uintptr, length uintptr, prot int, flags int, fd int, offset int64) (xaddr uintptr, err error) + +//go:linkname munmap syscall.munmap +func munmap(addr uintptr, length uintptr) (err error) + +// + +func increase_rlimit_nofile(nr uint64) error { + var rlim syscall.Rlimit + err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rlim) + if err != nil { + return err + } + if rlim.Cur < nr { + rlim.Cur += nr + err = syscall.Setrlimit(syscall.RLIMIT_NOFILE, &rlim) + } + return err +} diff --git a/uring.go b/uring.go new file mode 100644 index 0000000..f2222ff --- /dev/null +++ b/uring.go @@ -0,0 +1,23 @@ +package gouring + +func New(entries uint32, flags uint32) (*IoUring, error) { + ring := &IoUring{} + err := io_uring_queue_init(entries, ring, flags) + if err != nil { + return nil, err + } + return ring, nil +} + +func NewWithParamms(entries uint32, params *IoUringParams) (*IoUring, error) { + ring := &IoUring{} + err := io_uring_queue_init_params(entries, ring, params) + if err != nil { + return nil, err + } + return ring, nil +} + +func (h *IoUring) Close() { + h.io_uring_queue_exit() +} diff --git a/uring_test.go b/uring_test.go new file mode 100644 index 0000000..d3c4bd9 --- /dev/null +++ b/uring_test.go @@ -0,0 +1,15 @@ +package gouring + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestRingSetup(t *testing.T) { + h, err := New(256, 0) + assert.NoError(t, err) + assert.NotNil(t, h) + assert.NotEqual(t, 0, h.RingFd) + h.Close() +} diff --git a/util_ptr_arith.go b/util_ptr_arith.go new file mode 100644 index 0000000..9e2e2d7 --- /dev/null +++ b/util_ptr_arith.go @@ -0,0 +1,21 @@ +package gouring + +import "unsafe" + +type uint32Array *uint32 + +func uint32Array_Index(u uint32Array, i uintptr) *uint32 { + return (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(u)) + SizeofUint32*i)) +} + +type ioUringSqeArray *IoUringSqe + +func 
ioUringSqeArray_Index(u ioUringSqeArray, i uintptr) *IoUringSqe { + return (*IoUringSqe)(unsafe.Pointer(uintptr(unsafe.Pointer(u)) + SizeofIoUringSqe*i)) +} + +type ioUringCqeArray *IoUringCqe + +func ioUringCqeArray_Index(u ioUringCqeArray, i uintptr) *IoUringCqe { + return (*IoUringCqe)(unsafe.Pointer(uintptr(unsafe.Pointer(u)) + SizeofIoUringCqe*i)) +} diff --git a/util_union.go b/util_union.go new file mode 100644 index 0000000..419cd53 --- /dev/null +++ b/util_union.go @@ -0,0 +1,87 @@ +package gouring + +import "unsafe" + +type union64 [8]byte + +func (u *union64) PutUnsafe(v unsafe.Pointer) { putUnsafe(unsafe.Pointer(u), v) } +func (u *union64) PutUintptr(v uintptr) { putUintptr(unsafe.Pointer(u), v) } +func (u *union64) PutUint64(v uint64) { putUint64(unsafe.Pointer(u), v) } +func (u *union64) PutUint32(v uint32) { putUint32(unsafe.Pointer(u), v) } +func (u *union64) PutUint16(v uint16) { putUint16(unsafe.Pointer(u), v) } +func (u *union64) PutUint8(v uint8) { putUint8(unsafe.Pointer(u), v) } + +func (u *union64) PutInt32(v int32) { putInt32(unsafe.Pointer(u), v) } + +func (u *union64) Unsafe() unsafe.Pointer { return unsafe.Pointer(u) } +func (u *union64) Uint64() uint64 { return *(*uint64)(unsafe.Pointer(u)) } +func (u *union64) Uint32() uint32 { return *(*uint32)(unsafe.Pointer(u)) } +func (u *union64) Uint16() uint16 { return *(*uint16)(unsafe.Pointer(u)) } +func (u *union64) Uint8() uint8 { return *(*uint8)(unsafe.Pointer(u)) } + +type union32 [4]byte + +func (u *union32) PutUnsafe(v unsafe.Pointer) { putUnsafe(unsafe.Pointer(u), v) } +func (u *union32) PutUintptr(v uintptr) { putUintptr(unsafe.Pointer(u), uintptr(uint32(v))) } +func (u *union32) PutUint64(v uint64) { putUint32(unsafe.Pointer(u), uint32(v)) } +func (u *union32) PutUint32(v uint32) { putUint32(unsafe.Pointer(u), v) } +func (u *union32) PutUint16(v uint16) { putUint16(unsafe.Pointer(u), v) } +func (u *union32) PutUint8(v uint8) { putUint8(unsafe.Pointer(u), v) } + +func (u *union32) PutInt32(v int32) { putInt32(unsafe.Pointer(u), v) } + +func (u *union32) Unsafe() unsafe.Pointer { return unsafe.Pointer(u) } +func (u *union32) Uint64() uint64 { return *(*uint64)(unsafe.Pointer(u)) } +func (u *union32) Uint32() uint32 { return *(*uint32)(unsafe.Pointer(u)) } +func (u *union32) Uint16() uint16 { return *(*uint16)(unsafe.Pointer(u)) } +func (u *union32) Uint8() uint8 { return *(*uint8)(unsafe.Pointer(u)) } + +type union16 [2]byte + +func (u *union16) PutUnsafe(v unsafe.Pointer) { putUnsafe(unsafe.Pointer(u), v) } +func (u *union16) PutUintptr(v uintptr) { putUintptr(unsafe.Pointer(u), uintptr(uint16(v))) } +func (u *union16) PutUint64(v uint64) { putUint16(unsafe.Pointer(u), uint16(v)) } +func (u *union16) PutUint32(v uint32) { putUint16(unsafe.Pointer(u), uint16(v)) } +func (u *union16) PutUint16(v uint16) { putUint16(unsafe.Pointer(u), v) } +func (u *union16) PutUint8(v uint8) { putUint8(unsafe.Pointer(u), v) } + +func (u *union16) Unsafe() unsafe.Pointer { return unsafe.Pointer(u) } +func (u *union16) Uint64() uint64 { return *(*uint64)(unsafe.Pointer(u)) } +func (u *union16) Uint32() uint32 { return *(*uint32)(unsafe.Pointer(u)) } +func (u *union16) Uint16() uint16 { return *(*uint16)(unsafe.Pointer(u)) } +func (u *union16) Uint8() uint8 { return *(*uint8)(unsafe.Pointer(u)) } + +// + +func putUnsafe(ptr unsafe.Pointer, v unsafe.Pointer) { + *(*unsafe.Pointer)(ptr) = v +} + +func putUintptr(ptr unsafe.Pointer, v uintptr) { + *(*uintptr)(ptr) = v +} +func putUint64(ptr unsafe.Pointer, v uint64) { + 
*(*uint64)(ptr) = v +} +func putUint32(ptr unsafe.Pointer, v uint32) { + *(*uint32)(ptr) = v +} +func putUint16(ptr unsafe.Pointer, v uint16) { + *(*uint16)(ptr) = v +} +func putUint8(ptr unsafe.Pointer, v uint8) { + *(*uint8)(ptr) = v +} + +func putInt64(ptr unsafe.Pointer, v int64) { + *(*int64)(ptr) = v +} +func putInt32(ptr unsafe.Pointer, v int32) { + *(*int32)(ptr) = v +} +func putInt16(ptr unsafe.Pointer, v int16) { + *(*int16)(ptr) = v +} +func putInt8(ptr unsafe.Pointer, v int8) { + *(*int8)(ptr) = v +}
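
The commit above wires together ring setup (setup.go), request preparation (prep.go), submission and completion handling (queue.go), and resource registration (register.go), but the two tests only initialize a ring and fetch an SQE. As a rough in-package sketch of a complete round trip, using only identifiers introduced in this commit (New, io_uring_get_sqe, PrepNop, io_uringn_submit, which is the commit's spelling of submit, and io_uring_wait_cqes), the flow would look roughly like the following; the test name and user-data value are illustrative only and not part of the commit:

package gouring

import (
	"testing"

	"github.com/stretchr/testify/assert"
)

// Sketch only: exercises a no-op request end to end on a kernel with
// io_uring support (5.1+).
func TestNopRoundTrip(t *testing.T) {
	h, err := New(256, 0)
	assert.NoError(t, err)
	defer h.Close()

	// Grab a vacant SQE and prepare a no-op request on it.
	sqe := h.io_uring_get_sqe()
	assert.NotNil(t, sqe)
	PrepNop(sqe)
	sqe.UserData = 0xcafe // echoed back in the CQE

	// Push the SQ tail to the kernel.
	submitted, err := h.io_uringn_submit()
	assert.NoError(t, err)
	assert.Equal(t, 1, submitted)

	// Block for one completion and check that it round-tripped.
	var cqe *IoUringCqe
	err = h.io_uring_wait_cqes(&cqe, 1, nil, nil)
	assert.NoError(t, err)
	assert.EqualValues(t, 0xcafe, cqe.UserData)
	assert.EqualValues(t, 0, cqe.Res)
}

Before reusing the ring, a real caller would also advance the CQ head via the ring's CQ-advance helper from queue.go; the sketch exits right away and relies on Close to unmap the rings.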