Commit 60da8ef5 authored by Dan Melnic's avatar Dan Melnic Committed by Facebook Github Bot

Add iouring folly support, refactor the async IO

Summary: Add iouring folly support, refactor the async IO

Reviewed By: kevin-vigor

Differential Revision: D17834511

fbshipit-source-id: e20c876a32730549f305334fd5eed02cccf23638
parent b35bea8f
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Find module for liburing.
#
# Searches for the liburing.h header and the "uring" library. When both are
# located, LIBURING_FOUND is set and the results are exported through
# LIBURING_LIBRARIES and LIBURING_INCLUDE_DIRS.
find_path(LIBURING_INCLUDE_DIR NAMES liburing.h)
find_library(LIBURING_LIBRARY NAMES uring)
mark_as_advanced(LIBURING_INCLUDE_DIR LIBURING_LIBRARY)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
  LIBURING
  REQUIRED_VARS LIBURING_LIBRARY LIBURING_INCLUDE_DIR
)

if(LIBURING_FOUND)
  set(LIBURING_LIBRARIES ${LIBURING_LIBRARY})
  set(LIBURING_INCLUDE_DIRS ${LIBURING_INCLUDE_DIR})
endif()
......@@ -113,6 +113,10 @@ find_package(LibAIO)
list(APPEND FOLLY_LINK_LIBRARIES ${LIBAIO_LIBRARIES})
list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBAIO_INCLUDE_DIRS})
# liburing is looked up without REQUIRED; whatever the find module exported
# is appended to folly's link and include lists.
find_package(LibUring)
list(APPEND FOLLY_LINK_LIBRARIES ${LIBURING_LIBRARIES})
list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBURING_INCLUDE_DIRS})
find_package(Libsodium)
list(APPEND FOLLY_LINK_LIBRARIES ${LIBSODIUM_LIBRARIES})
list(APPEND FOLLY_INCLUDE_DIRECTORIES ${LIBSODIUM_INCLUDE_DIRS})
......
......@@ -203,6 +203,23 @@ if (NOT ${LIBAIO_FOUND})
${FOLLY_DIR}/experimental/io/AsyncIO.h
)
endif()
# Without liburing, drop the io_uring backend from the source and header lists.
if (NOT ${LIBURING_FOUND})
list(REMOVE_ITEM files
${FOLLY_DIR}/experimental/io/IoUring.cpp
)
list(REMOVE_ITEM hfiles
${FOLLY_DIR}/experimental/io/IoUring.h
)
endif()
# AsyncBase is dropped only when neither libaio nor liburing was found.
if (NOT ${LIBAIO_FOUND} AND NOT ${LIBURING_FOUND})
list(REMOVE_ITEM files
${FOLLY_DIR}/experimental/io/AsyncBase.cpp
)
list(REMOVE_ITEM hfiles
${FOLLY_DIR}/experimental/io/AsyncBase.h
)
endif()
if (${LIBSODIUM_FOUND})
string(FIND "${CMAKE_LIBRARY_ARCHITECTURE}" "x86_64" IS_X86_64_ARCH)
if (${IS_X86_64_ARCH} STREQUAL "-1")
......
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/io/AsyncBase.h>
#include <sys/eventfd.h>
#include <cerrno>
#include <ostream>
#include <stdexcept>
#include <string>
#include <boost/intrusive/parent_from_member.hpp>
#include <glog/logging.h>
#include <folly/Exception.h>
#include <folly/Format.h>
#include <folly/Likely.h>
#include <folly/String.h>
#include <folly/portability/Unistd.h>
namespace folly {
AsyncBaseOp::AsyncBaseOp(NotificationCallback cb)
: cb_(std::move(cb)), state_(State::UNINITIALIZED), result_(-EINVAL) {}
// Returns the op to its freshly constructed condition (UNINITIALIZED,
// result -EINVAL) and installs a new notification callback. Aborts if the op
// is still PENDING.
void AsyncBaseOp::reset(NotificationCallback cb) {
  CHECK_NE(state_, State::PENDING);

  result_ = -EINVAL;
  state_ = State::UNINITIALIZED;
  cb_ = std::move(cb);
}
// Aborts (CHECK) if the op is destroyed while it is still PENDING.
AsyncBaseOp::~AsyncBaseOp() {
CHECK_NE(state_, State::PENDING);
}
// Marks the op as PENDING. The INITIALIZED precondition is verified by a
// DCHECK only.
void AsyncBaseOp::start() {
DCHECK_EQ(state_, State::INITIALIZED);
state_ = State::PENDING;
}
// Records the result of a PENDING op, moves it to COMPLETED and then invokes
// the notification callback, if one is set, with this op as its argument.
void AsyncBaseOp::complete(ssize_t result) {
  DCHECK_EQ(state_, State::PENDING);

  result_ = result;
  state_ = State::COMPLETED;

  if (!cb_) {
    return;
  }
  cb_(this);
}
// Marks a PENDING op as CANCELED. The notification callback is not invoked
// and result_ is left untouched.
void AsyncBaseOp::cancel() {
DCHECK_EQ(state_, State::PENDING);
state_ = State::CANCELED;
}
// Returns the stored result. Aborts (CHECK) unless the op is COMPLETED.
ssize_t AsyncBaseOp::result() const {
CHECK_EQ(state_, State::COMPLETED);
return result_;
}
// Moves the op from UNINITIALIZED to INITIALIZED. Aborts (CHECK) if the op is
// in any other state, e.g. when it is reused without a reset().
void AsyncBaseOp::init() {
CHECK_EQ(state_, State::UNINITIALIZED);
state_ = State::INITIALIZED;
}
// Debug helper: resolves the /proc/self/fd/<fd> symlink and returns its
// target. Returns an empty string when readlink() fails.
std::string AsyncBaseOp::fd2name(int fd) {
  const std::string procEntry = folly::to<std::string>("/proc/self/fd/", fd);

  char target[PATH_MAX];
  ssize_t written = readlink(procEntry.c_str(), target, sizeof(target));
  if (written < 0) {
    // readlink() reported an error; treat it as an empty name.
    written = 0;
  }
  // readlink() does not NUL-terminate, so build the string from the length.
  return std::string(target, written);
}
// Stores the (non-zero) capacity and pre-sizes the completion buffer. In
// POLLABLE mode a non-blocking eventfd is created and kept in pollFd_; a
// failure to create it is reported through checkUnixError().
AsyncBase::AsyncBase(size_t capacity, PollMode pollMode) : capacity_(capacity) {
  CHECK_GT(capacity_, 0);
  completed_.reserve(capacity_);

  if (pollMode != POLLABLE) {
    return;
  }
  pollFd_ = eventfd(0, EFD_NONBLOCK);
  checkUnixError(pollFd_, "AsyncBase: eventfd creation failed");
}
// Requires that no requests are pending, then closes the eventfd if one was
// created.
AsyncBase::~AsyncBase() {
  CHECK_EQ(pending_, 0);
  if (pollFd_ == -1) {
    return;
  }
  CHECK_ERR(close(pollFd_));
}
// Atomically lowers the pending-request counter by one. In debug builds the
// previous value is checked to be at least one.
void AsyncBase::decrementPending() {
  auto p = pending_.fetch_sub(1, std::memory_order_acq_rel);
  DCHECK_GE(p, 1);
}
// Submits one INITIALIZED op through the subclass's submitOne().
//
// Throws std::range_error when capacity_ requests are already pending. When
// submitOne() returns a negative value, throwSystemErrorExplicit() is called
// with its negation. On both failure paths pending_ is restored before
// throwing.
void AsyncBase::submit(Op* op) {
CHECK_EQ(op->state(), Op::State::INITIALIZED);
initializeContext(); // on demand
// We can increment past capacity, but we'll clean up after ourselves.
auto p = pending_.fetch_add(1, std::memory_order_acq_rel);
if (p >= capacity_) {
decrementPending();
throw std::range_error("AsyncBase: too many pending requests");
}
int rc = submitOne(op);
if (rc < 0) {
decrementPending();
// A negative rc is interpreted as -errno.
throwSystemErrorExplicit(-rc, "AsyncBase: io_submit failed");
}
submitted_++;
DCHECK_EQ(rc, 1);
// NOTE(review): the op only becomes PENDING after submitOne() has returned;
// presumably a completion reaped on another thread before this line would
// trip the DCHECK in AsyncBaseOp::complete() -- confirm.
op->start();
}
// Waits via doWait() until at least minRequests ops have completed and
// returns them in completed_. Only valid on an initialized, non-pollable
// object; minRequests must not exceed the current pending count.
Range<AsyncBase::Op**> AsyncBase::wait(size_t minRequests) {
CHECK(isInit());
CHECK_EQ(pollFd_, -1) << "wait() only allowed on non-pollable object";
// Snapshot of the pending count; it also caps how many ops doWait() reaps.
auto p = pending_.load(std::memory_order_acquire);
CHECK_LE(minRequests, p);
return doWait(WaitType::COMPLETE, minRequests, p, completed_);
}
// Asks doWait() to process every currently pending request in CANCEL mode
// and returns the affected ops, which are stored in canceled_.
Range<AsyncBase::Op**> AsyncBase::cancel() {
  CHECK(isInit());
  const auto outstanding = pending_.load(std::memory_order_acquire);
  return doWait(WaitType::CANCEL, outstanding, outstanding, canceled_);
}
// Drains the eventfd counter and reaps exactly that many completions.
// Returns an empty range when the eventfd has nothing to report (EAGAIN).
// Only valid on an initialized, pollable object.
Range<AsyncBase::Op**> AsyncBase::pollCompleted() {
  CHECK(isInit());
  CHECK_NE(pollFd_, -1) << "pollCompleted() only allowed on pollable object";

  // Reading the eventfd resets its counter to 0, see
  // http://www.kernel.org/doc/man-pages/online/pages/man2/eventfd.2.html
  uint64_t numEvents;
  ssize_t rc;
  while ((rc = ::read(pollFd_, &numEvents, sizeof(numEvents))) == -1 &&
         errno == EINTR) {
    // Interrupted by a signal: retry.
  }

  if (UNLIKELY(rc == -1 && errno == EAGAIN)) {
    return Range<Op**>(); // nothing completed
  }
  checkUnixError(rc, "AsyncBase: read from event fd failed");
  DCHECK_EQ(rc, 8);
  DCHECK_GT(numEvents, 0);
  DCHECK_LE(numEvents, pending_);

  // The counter was just reset, so never reap more than numEvents.
  return doWait(WaitType::COMPLETE, numEvents, numEvents, completed_);
}
AsyncBaseQueue::AsyncBaseQueue(AsyncBase* asyncBase) : asyncBase_(asyncBase) {}
// Aborts (CHECK) if the wrapped AsyncBase still has pending requests.
AsyncBaseQueue::~AsyncBaseQueue() {
CHECK_EQ(asyncBase_->pending(), 0);
}
// Queues an already constructed op by wrapping it in a factory that simply
// hands back the pointer.
void AsyncBaseQueue::submit(AsyncBaseOp* op) {
  OpFactory factory = [op]() { return op; };
  submit(std::move(factory));
}
// Appends a deferred op factory to the FIFO queue and immediately tries to
// dispatch queued factories while the AsyncBase has spare capacity.
void AsyncBaseQueue::submit(OpFactory op) {
  // `op` is a by-value sink parameter: move it into the queue rather than
  // copying the std::function (and whatever state its target captured).
  queue_.push_back(std::move(op));
  maybeDequeue();
}
// Called from the interposed completion callback; the completed op itself is
// not inspected, its completion is only used as a trigger to dispatch more
// queued work.
void AsyncBaseQueue::onCompleted(AsyncBaseOp* /* op */) {
maybeDequeue();
}
// Dispatches queued factories in FIFO order for as long as the AsyncBase
// reports fewer pending requests than its capacity. Each produced op gets its
// notification callback wrapped so the queue is re-examined on completion.
void AsyncBaseQueue::maybeDequeue() {
while (!queue_.empty() && asyncBase_->pending() < asyncBase_->capacity()) {
auto& opFactory = queue_.front();
// The factory must be invoked before pop_front() invalidates the reference.
auto op = opFactory();
queue_.pop_front();
// Interpose our completion callback
auto& nextCb = op->notificationCallback();
// nextCb is captured by copy, so the lambda keeps the op's previous callback
// after setNotificationCallback() overwrites it. The queue is notified first,
// then the previous callback (if any) runs.
op->setNotificationCallback([this, nextCb](AsyncBaseOp* op2) {
this->onCompleted(op2);
if (nextCb) {
nextCb(op2);
}
});
asyncBase_->submit(op);
}
}
// debugging helpers:
namespace {
// Returns the fully qualified enumerator name for `state`, or a placeholder
// string for values outside the enum.
const char* asyncIoOpStateToString(AsyncBaseOp::State state) {
  switch (state) {
    case AsyncBaseOp::State::UNINITIALIZED:
      return "AsyncBaseOp::State::UNINITIALIZED";
    case AsyncBaseOp::State::INITIALIZED:
      return "AsyncBaseOp::State::INITIALIZED";
    case AsyncBaseOp::State::PENDING:
      return "AsyncBaseOp::State::PENDING";
    case AsyncBaseOp::State::COMPLETED:
      return "AsyncBaseOp::State::COMPLETED";
    case AsyncBaseOp::State::CANCELED:
      return "AsyncBaseOp::State::CANCELED";
  }
  return "<INVALID AsyncBaseOp::State>";
}
} // namespace
// Streams an op by delegating to its virtual toStream().
std::ostream& operator<<(std::ostream& os, const AsyncBaseOp& op) {
op.toStream(os);
return os;
}
// Streams the textual name of an op state.
std::ostream& operator<<(std::ostream& os, AsyncBaseOp::State state) {
  const char* const name = asyncIoOpStateToString(state);
  os << name;
  return os;
}
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <sys/types.h>
#include <atomic>
#include <cstdint>
#include <deque>
#include <functional>
#include <iosfwd>
#include <mutex>
#include <utility>
#include <vector>
#include <folly/Portability.h>
#include <folly/Range.h>
#include <folly/portability/SysUio.h>
namespace folly {
class AsyncIOOp;
class IoUringOp;
/**
* An AsyncBaseOp represents a pending operation. You may set a notification
* callback or you may use this class's methods directly.
*
* The op must remain allocated until it is completed or canceled.
*
* This is the abstract, backend-independent part of an op: it owns the
* state machine, the result and the callback, while the request itself is
* prepared by the pure virtual pread/pwrite family.
*/
class AsyncBaseOp {
// AsyncBase drives the protected state transitions (complete / cancel).
friend class AsyncBase;
public:
// Invoked with the op itself once the op has completed.
using NotificationCallback = std::function<void(AsyncBaseOp*)>;
explicit AsyncBaseOp(NotificationCallback cb = NotificationCallback());
AsyncBaseOp(const AsyncBaseOp&) = delete;
AsyncBaseOp& operator=(const AsyncBaseOp&) = delete;
virtual ~AsyncBaseOp();
// Life cycle: UNINITIALIZED -> init() -> INITIALIZED -> start() -> PENDING
// -> complete() -> COMPLETED, or -> cancel() -> CANCELED.
enum class State {
UNINITIALIZED,
INITIALIZED,
PENDING,
COMPLETED,
CANCELED,
};
/**
* Initiate a read request.
*/
virtual void pread(int fd, void* buf, size_t size, off_t start) = 0;
// Convenience overload: forwards the range as pointer + size.
void pread(int fd, Range<unsigned char*> range, off_t start) {
pread(fd, range.begin(), range.size(), start);
}
virtual void preadv(int fd, const iovec* iov, int iovcnt, off_t start) = 0;
/**
* Initiate a write request.
*/
virtual void pwrite(int fd, const void* buf, size_t size, off_t start) = 0;
// Convenience overload: forwards the range as pointer + size.
void pwrite(int fd, Range<const unsigned char*> range, off_t start) {
pwrite(fd, range.begin(), range.size(), start);
}
virtual void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) = 0;
// we support only these subclasses
// Each accessor exposes the op as one specific concrete subclass.
virtual AsyncIOOp* getAsyncIOOp() = 0;
virtual IoUringOp* getIoUringOp() = 0;
// ostream output
virtual void toStream(std::ostream& os) const = 0;
/**
* Return the current operation state.
*/
State state() const {
return state_;
}
/**
* user data get/set
*
* An opaque pointer stored alongside the op; this class only stores and
* returns it.
*/
void* getUserData() const {
return userData_;
}
void setUserData(void* userData) {
userData_ = userData;
}
/**
* Reset the operation for reuse. It is an error to call reset() on
* an Op that is still pending.
*/
virtual void reset(NotificationCallback cb = NotificationCallback()) = 0;
// Replaces the notification callback.
void setNotificationCallback(NotificationCallback cb) {
cb_ = std::move(cb);
}
// Returns the currently installed notification callback.
const NotificationCallback& notificationCallback() const {
return cb_;
}
/**
* Retrieve the result of this operation. Returns >=0 on success,
* -errno on failure (that is, using the Linux kernel error reporting
* conventions). Use checkKernelError (folly/Exception.h) on the result to
* throw a std::system_error in case of error instead.
*
* It is an error to call this if the Op hasn't completed.
*/
ssize_t result() const;
// debug helper
static std::string fd2name(int fd);
protected:
// State transitions; see the State enum for the order they are applied in.
void init();
void start();
void complete(ssize_t result);
void cancel();
// Completion callback; may be empty.
NotificationCallback cb_;
// Current position in the life cycle.
State state_;
// Result of the operation; -EINVAL until the op has completed.
ssize_t result_;
// Opaque pointer set through setUserData().
void* userData_{nullptr};
};
std::ostream& operator<<(std::ostream& stream, const AsyncBaseOp& op);
std::ostream& operator<<(std::ostream& stream, AsyncBaseOp::State state);
/**
* Generic C++ interface around Linux IO(io_submit, io_uring)
*
* Subclasses provide the backend by implementing initializeContext(),
* submitOne() and doWait().
*/
class AsyncBase {
public:
using Op = AsyncBaseOp;
enum PollMode {
NOT_POLLABLE,
POLLABLE,
};
/**
* Create an AsyncBase context capable of holding at most 'capacity' pending
* requests at the same time. As requests complete, others can be scheduled,
* as long as this limit is not exceeded.
*
* If pollMode is POLLABLE, pollFd() will return a file descriptor that
* can be passed to poll / epoll / select and will become readable when
* any IOs on this AsyncBase have completed. If you do this, you must use
* pollCompleted() instead of wait() -- do not read from the pollFd()
* file descriptor directly.
*
* You may use the same AsyncBase object from multiple threads, as long as
* there is only one concurrent caller of wait() / pollCompleted() / cancel()
* (perhaps by always calling it from the same thread, or by providing
* appropriate mutual exclusion). In this case, pending() returns a snapshot
* of the current number of pending requests.
*/
explicit AsyncBase(size_t capacity, PollMode pollMode = NOT_POLLABLE);
AsyncBase(const AsyncBase&) = delete;
AsyncBase& operator=(const AsyncBase&) = delete;
virtual ~AsyncBase();
/**
* Wait for at least minRequests to complete. Returns the requests that
* have completed; the returned range is valid until the next call to
* wait(). minRequests may be 0 to not block.
*/
Range<Op**> wait(size_t minRequests);
/**
* Cancel all pending requests and return them; the returned range is
* valid until the next call to cancel().
*/
Range<Op**> cancel();
/**
* Return the number of pending requests.
*/
size_t pending() const {
return pending_;
}
/**
* Return the maximum number of requests that can be kept outstanding
* at any one time.
*/
size_t capacity() const {
return capacity_;
}
/**
* Return the accumulative number of submitted I/O, since this object
* has been created.
*/
size_t totalSubmits() const {
return submitted_;
}
/**
* If POLLABLE, return a file descriptor that can be passed to poll / epoll
* and will become readable when any async IO operations have completed.
* If NOT_POLLABLE, return -1.
*/
int pollFd() const {
return pollFd_;
}
/**
* If POLLABLE, call instead of wait after the file descriptor returned
* by pollFd() became readable. The returned range is valid until the next
* call to pollCompleted().
*/
Range<Op**> pollCompleted();
/**
* Submit an op for execution.
*/
void submit(Op* op);
protected:
// Forwarders that let subclasses trigger the op's protected state
// transitions (AsyncBase is a friend of AsyncBaseOp).
void complete(Op* op, ssize_t result) {
op->complete(result);
}
void cancel(Op* op) {
op->cancel();
}
// Whether the backend context has been marked as initialized.
bool isInit() const {
return init_.load(std::memory_order_relaxed);
}
// Atomically lowers pending_ by one.
void decrementPending();
// Backend hook: called by submit() before every submission, so it must
// tolerate repeated calls.
virtual void initializeContext() = 0;
// Backend hook: submit() treats a negative return value as -errno and
// expects 1 on success.
virtual int submitOne(AsyncBase::Op* op) = 0;
enum class WaitType { COMPLETE, CANCEL };
// Backend hook: reaps between minRequests and maxRequests ops into
// `result`, applying the transition selected by `type`, and returns a range
// over them.
virtual Range<AsyncBase::Op**> doWait(
WaitType type,
size_t minRequests,
size_t maxRequests,
std::vector<Op*>& result) = 0;
// Initialization flag and the mutex available to guard initialization.
std::atomic<bool> init_{false};
std::mutex initMutex_;
// Number of requests currently outstanding.
std::atomic<size_t> pending_{0};
// Total number of successful submissions.
std::atomic<size_t> submitted_{0};
const size_t capacity_;
// eventfd used in POLLABLE mode, -1 otherwise.
int pollFd_{-1};
// Result buffers handed to doWait() by wait()/pollCompleted() and cancel().
std::vector<Op*> completed_;
std::vector<Op*> canceled_;
};
/**
* Wrapper around AsyncBase that allows you to schedule more requests than
* the AsyncBase's object capacity. Other requests are queued and processed
* in a FIFO order.
*/
class AsyncBaseQueue {
public:
/**
* Create a queue, using the given AsyncBase object.
* The AsyncBase object may not be used by anything else until the
* queue is destroyed.
*/
explicit AsyncBaseQueue(AsyncBase* asyncBase);
~AsyncBaseQueue();
/**
* Return the number of factories still waiting to be dispatched.
*/
size_t queued() const {
return queue_.size();
}
/**
* Submit an op to the AsyncBase queue. The op will be queued until
* the AsyncBase object has room.
*/
void submit(AsyncBaseOp* op);
/**
* Submit a delayed op to the AsyncBase queue; this allows you to postpone
* creation of the Op (which may require allocating memory, etc) until
* the AsyncBase object has room.
*/
using OpFactory = std::function<AsyncBaseOp*()>;
void submit(OpFactory op);
private:
// Completion hook installed on every dispatched op.
void onCompleted(AsyncBaseOp* op);
// Dispatches queued factories while the AsyncBase has spare capacity.
void maybeDequeue();
// Not owned.
AsyncBase* asyncBase_;
// Factories waiting for capacity, oldest first.
std::deque<OpFactory> queue_;
};
} // namespace folly
This diff is collapsed.
......@@ -16,267 +16,80 @@
#pragma once
#include <sys/types.h>
#include <atomic>
#include <cstdint>
#include <deque>
#include <functional>
#include <iosfwd>
#include <mutex>
#include <utility>
#include <vector>
#include <libaio.h>
#include <folly/Portability.h>
#include <folly/Range.h>
#include <folly/portability/SysUio.h>
#include <folly/experimental/io/AsyncBase.h>
namespace folly {
/**
* An AsyncIOOp represents a pending operation. You may set a notification
* callback or you may use this class's methods directly.
*
* The op must remain allocated until it is completed or canceled.
*/
class AsyncIOOp {
class AsyncIOOp : public AsyncBaseOp {
friend class AsyncIO;
friend std::ostream& operator<<(std::ostream& stream, const AsyncIOOp& o);
public:
typedef std::function<void(AsyncIOOp*)> NotificationCallback;
explicit AsyncIOOp(NotificationCallback cb = NotificationCallback());
AsyncIOOp(const AsyncIOOp&) = delete;
AsyncIOOp& operator=(const AsyncIOOp&) = delete;
~AsyncIOOp();
enum class State {
UNINITIALIZED,
INITIALIZED,
PENDING,
COMPLETED,
CANCELED,
};
~AsyncIOOp() override;
/**
* Initiate a read request.
*/
void pread(int fd, void* buf, size_t size, off_t start);
void pread(int fd, Range<unsigned char*> range, off_t start);
void preadv(int fd, const iovec* iov, int iovcnt, off_t start);
void pread(int fd, void* buf, size_t size, off_t start) override;
void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override;
/**
* Initiate a write request.
*/
void pwrite(int fd, const void* buf, size_t size, off_t start);
void pwrite(int fd, Range<const unsigned char*> range, off_t start);
void pwritev(int fd, const iovec* iov, int iovcnt, off_t start);
/**
* Return the current operation state.
*/
State state() const {
return state_;
}
void pwrite(int fd, const void* buf, size_t size, off_t start) override;
void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override;
/**
* Reset the operation for reuse. It is an error to call reset() on
* an Op that is still pending.
*/
void reset(NotificationCallback cb = NotificationCallback());
void reset(NotificationCallback cb = NotificationCallback()) override;
void setNotificationCallback(NotificationCallback cb) {
cb_ = std::move(cb);
AsyncIOOp* getAsyncIOOp() override {
return this;
}
const NotificationCallback& notificationCallback() const {
return cb_;
IoUringOp* getIoUringOp() override {
return nullptr;
}
/**
* Retrieve the result of this operation. Returns >=0 on success,
* -errno on failure (that is, using the Linux kernel error reporting
* conventions). Use checkKernelError (folly/Exception.h) on the result to
* throw a std::system_error in case of error instead.
*
* It is an error to call this if the Op hasn't completed.
*/
ssize_t result() const;
void toStream(std::ostream& os) const override;
private:
void init();
void start();
void complete(ssize_t result);
void cancel();
NotificationCallback cb_;
iocb iocb_;
State state_;
ssize_t result_;
};
std::ostream& operator<<(std::ostream& stream, const AsyncIOOp& o);
std::ostream& operator<<(std::ostream& stream, AsyncIOOp::State state);
std::ostream& operator<<(std::ostream& stream, const AsyncIOOp& op);
/**
* C++ interface around Linux Async IO.
*/
class AsyncIO {
class AsyncIO : public AsyncBase {
public:
typedef AsyncIOOp Op;
enum PollMode {
NOT_POLLABLE,
POLLABLE,
};
using Op = AsyncIOOp;
/**
* Create an AsyncIO context capable of holding at most 'capacity' pending
* requests at the same time. As requests complete, others can be scheduled,
* as long as this limit is not exceeded.
*
* Note: the maximum number of allowed concurrent requests is controlled
* by the fs.aio-max-nr sysctl, the default value is usually 64K.
*
* If pollMode is POLLABLE, pollFd() will return a file descriptor that
* can be passed to poll / epoll / select and will become readable when
* any IOs on this AsyncIO have completed. If you do this, you must use
* pollCompleted() instead of wait() -- do not read from the pollFd()
* file descriptor directly.
*
* You may use the same AsyncIO object from multiple threads, as long as
* there is only one concurrent caller of wait() / pollCompleted() / cancel()
* (perhaps by always calling it from the same thread, or by providing
* appropriate mutual exclusion). In this case, pending() returns a snapshot
* of the current number of pending requests.
*/
explicit AsyncIO(size_t capacity, PollMode pollMode = NOT_POLLABLE);
AsyncIO(const AsyncIO&) = delete;
AsyncIO& operator=(const AsyncIO&) = delete;
~AsyncIO();
/**
* Wait for at least minRequests to complete. Returns the requests that
* have completed; the returned range is valid until the next call to
* wait(). minRequests may be 0 to not block.
*/
Range<Op**> wait(size_t minRequests);
/**
* Cancel all pending requests and return them; the returned range is
* valid until the next call to cancel().
*/
Range<Op**> cancel();
/**
* Return the number of pending requests.
*/
size_t pending() const {
return pending_;
}
/**
* Return the maximum number of requests that can be kept outstanding
* at any one time.
*/
size_t capacity() const {
return capacity_;
}
/**
* Return the accumulative number of submitted I/O, since this object
* has been created.
*/
size_t totalSubmits() const {
return submitted_;
}
/**
* If POLLABLE, return a file descriptor that can be passed to poll / epoll
* and will become readable when any async IO operations have completed.
* If NOT_POLLABLE, return -1.
*/
int pollFd() const {
return pollFd_;
}
/**
* If POLLABLE, call instead of wait after the file descriptor returned
* by pollFd() became readable. The returned range is valid until the next
* call to pollCompleted().
*/
Range<Op**> pollCompleted();
/**
* Submit an op for execution.
*/
void submit(Op* op);
~AsyncIO() override;
private:
void decrementPending();
void initializeContext();
void initializeContext() override;
int submitOne(AsyncBase::Op* op) override;
enum class WaitType { COMPLETE, CANCEL };
Range<AsyncIO::Op**> doWait(
Range<AsyncBase::Op**> doWait(
WaitType type,
size_t minRequests,
size_t maxRequests,
std::vector<Op*>& result);
std::vector<AsyncBase::Op*>& result) override;
io_context_t ctx_{nullptr};
std::atomic<bool> ctxSet_{false};
std::mutex initMutex_;
std::atomic<size_t> pending_{0};
std::atomic<size_t> submitted_{0};
const size_t capacity_;
int pollFd_{-1};
std::vector<Op*> completed_;
std::vector<Op*> canceled_;
};
/**
* Wrapper around AsyncIO that allows you to schedule more requests than
* the AsyncIO's object capacity. Other requests are queued and processed
* in a FIFO order.
*/
class AsyncIOQueue {
public:
/**
* Create a queue, using the given AsyncIO object.
* The AsyncIO object may not be used by anything else until the
* queue is destroyed.
*/
explicit AsyncIOQueue(AsyncIO* asyncIO);
~AsyncIOQueue();
size_t queued() const {
return queue_.size();
}
/**
* Submit an op to the AsyncIO queue. The op will be queued until
* the AsyncIO object has room.
*/
void submit(AsyncIOOp* op);
/**
* Submit a delayed op to the AsyncIO queue; this allows you to postpone
* creation of the Op (which may require allocating memory, etc) until
* the AsyncIO object has room.
*/
typedef std::function<AsyncIOOp*()> OpFactory;
void submit(OpFactory op);
private:
void onCompleted(AsyncIOOp* op);
void maybeDequeue();
AsyncIO* asyncIO_;
std::deque<OpFactory> queue_;
};
using AsyncIOQueue = AsyncBaseQueue;
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/io/IoUring.h>
#include <sys/eventfd.h>
#include <cerrno>
#include <ostream>
#include <stdexcept>
#include <string>
#include <boost/intrusive/parent_from_member.hpp>
#include <glog/logging.h>
#include <folly/Exception.h>
#include <folly/Format.h>
#include <folly/Likely.h>
#include <folly/String.h>
#include <folly/portability/Unistd.h>
// helpers
namespace {
// Rounds `num` up to the nearest power of two; powers of two are returned
// unchanged and 0 maps to 0. Values above 2^31 wrap around to 0.
// See http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
uint32_t roundUpToNextPowerOfTwo(uint32_t num) {
  if (num == 0) {
    return 0;
  }
  --num;
  // Smear the highest set bit into every lower position.
  for (unsigned shift = 1; shift <= 16; shift *= 2) {
    num |= num >> shift;
  }
  return num + 1;
}
// Returns the symbolic name of an io_uring opcode, or a placeholder string
// for opcodes not listed here.
const char* ioUringOpToString(unsigned char op) {
  switch (op) {
    case IORING_OP_NOP:
      return "IORING_OP_NOP";
    case IORING_OP_READV:
      return "IORING_OP_READV";
    case IORING_OP_WRITEV:
      return "IORING_OP_WRITEV";
    case IORING_OP_FSYNC:
      return "IORING_OP_FSYNC";
    case IORING_OP_READ_FIXED:
      return "IORING_OP_READ_FIXED";
    case IORING_OP_WRITE_FIXED:
      return "IORING_OP_WRITE_FIXED";
    case IORING_OP_POLL_ADD:
      return "IORING_OP_POLL_ADD";
    case IORING_OP_POLL_REMOVE:
      return "IORING_OP_POLL_REMOVE";
    case IORING_OP_SYNC_FILE_RANGE:
      return "IORING_OP_SYNC_FILE_RANGE";
    case IORING_OP_SENDMSG:
      return "IORING_OP_SENDMSG";
    case IORING_OP_RECVMSG:
      return "IORING_OP_RECVMSG";
    case IORING_OP_TIMEOUT:
      return "IORING_OP_TIMEOUT";
  }
  return "<INVALID op>";
}
// Writes a human-readable description of a submission queue entry. The
// common fields are always printed; the iovec list is only expanded for
// READV / WRITEV entries.
void toStream(std::ostream& os, const struct io_uring_sqe& sqe) {
  os << folly::format(
      "user_data={}, opcode={}, ioprio={}, f={}, ",
      sqe.user_data,
      ioUringOpToString(sqe.opcode),
      sqe.ioprio,
      folly::AsyncBaseOp::fd2name(sqe.fd));

  const bool isVectored =
      sqe.opcode == IORING_OP_READV || sqe.opcode == IORING_OP_WRITEV;
  if (!isVectored) {
    os << "[TODO: write debug string for " << ioUringOpToString(sqe.opcode)
       << "] ";
    return;
  }

  // For vectored I/O, addr holds the iovec array and len its element count.
  auto* vecs = reinterpret_cast<struct iovec*>(sqe.addr);
  auto offset = sqe.off;
  os << "{";
  for (unsigned int idx = 0; idx < sqe.len; ++idx) {
    if (idx != 0) {
      os << ",";
    }
    os << folly::format(
        "buf={}, offset={}, nbytes={}",
        vecs[idx].iov_base,
        offset,
        vecs[idx].iov_len);
    // Each segment starts where the previous one ended.
    offset += vecs[idx].iov_len;
  }
  os << "}";
}
} // namespace
namespace folly {
// Forwards the callback to the base class. sqe_ and iov_ are not initialized
// here; they are filled in by the pread/pwrite family.
IoUringOp::IoUringOp(NotificationCallback cb) : AsyncBaseOp(std::move(cb)) {}
// Resets the op for reuse and installs a new notification callback. Aborts
// if the op is still PENDING.
//
// IoUringOp adds no state of its own that needs clearing, so this delegates
// to the shared base implementation rather than repeating it.
void IoUringOp::reset(NotificationCallback cb) {
  AsyncBaseOp::reset(std::move(cb));
}
// Nothing to release beyond what the base class destructor handles.
IoUringOp::~IoUringOp() = default;
// Prepares a single-buffer read. The buffer is described by the op's own
// one-element iovec, and the op pointer is attached as the sqe's user data.
void IoUringOp::pread(int fd, void* buf, size_t size, off_t start) {
  init();

  struct iovec& vec = iov_[0];
  vec.iov_len = size;
  vec.iov_base = buf;

  io_uring_prep_readv(&sqe_, fd, &vec, 1, start);
  io_uring_sqe_set_data(&sqe_, this);
}
// Prepares a vectored read using the caller's iovec array and attaches the op
// pointer as the sqe's user data.
// NOTE(review): the caller's array is passed through by pointer, so presumably
// it must stay valid until the request has been submitted -- confirm.
void IoUringOp::preadv(int fd, const iovec* iov, int iovcnt, off_t start) {
init();
io_uring_prep_readv(&sqe_, fd, iov, iovcnt, start);
io_uring_sqe_set_data(&sqe_, this);
}
// Prepares a single-buffer write. The buffer is described by the op's own
// one-element iovec, and the op pointer is attached as the sqe's user data.
void IoUringOp::pwrite(int fd, const void* buf, size_t size, off_t start) {
  init();

  struct iovec& vec = iov_[0];
  vec.iov_len = size;
  // iovec has no const-qualified base pointer, hence the cast.
  vec.iov_base = const_cast<void*>(buf);

  io_uring_prep_writev(&sqe_, fd, &vec, 1, start);
  io_uring_sqe_set_data(&sqe_, this);
}
// Prepares a vectored write using the caller's iovec array and attaches the
// op pointer as the sqe's user data.
// NOTE(review): the caller's array is passed through by pointer, so presumably
// it must stay valid until the request has been submitted -- confirm.
void IoUringOp::pwritev(int fd, const iovec* iov, int iovcnt, off_t start) {
init();
io_uring_prep_writev(&sqe_, fd, iov, iovcnt, start);
io_uring_sqe_set_data(&sqe_, this);
}
// Prints the op as "{<state>, <sqe details>result=<n> (<error>), }". The sqe
// details are omitted while the op is UNINITIALIZED, and the result part is
// only present once the op has COMPLETED.
void IoUringOp::toStream(std::ostream& os) const {
  const bool hasSqe = state_ != AsyncBaseOp::State::UNINITIALIZED;
  const bool hasResult = state_ == AsyncBaseOp::State::COMPLETED;

  os << "{" << state_ << ", ";
  if (hasSqe) {
    ::toStream(os, sqe_);
  }
  if (hasResult) {
    os << "result=" << result_;
    if (result_ < 0) {
      // Negative results are -errno; append the error text.
      os << " (" << errnoStr(-result_) << ')';
    }
    os << ", ";
  }
  os << "}";
}
// Streams an IoUringOp by delegating to its toStream().
std::ostream& operator<<(std::ostream& os, const IoUringOp& op) {
op.toStream(os);
return os;
}
// Records the configuration only; the ring itself is created on demand by
// initializeContext(). maxSubmit is clamped to capacity, and the completion
// queue size is requested as capacity rounded up to a power of two.
IoUring::IoUring(size_t capacity, PollMode pollMode, size_t maxSubmit)
    : AsyncBase(capacity, pollMode),
      maxSubmit_((capacity < maxSubmit) ? capacity : maxSubmit) {
  ::memset(&params_, 0, sizeof(params_));
  ::memset(&ioRing_, 0, sizeof(ioRing_));

  params_.cq_entries = roundUpToNextPowerOfTwo(capacity_);
  params_.flags |= IORING_SETUP_CQSIZE;
}
// Requires that no requests are pending, then tears down the ring if its
// file descriptor indicates that it was set up.
IoUring::~IoUring() {
  CHECK_EQ(pending_, 0);
  if (ioRing_.ring_fd <= 0) {
    return;
  }
  ::io_uring_queue_exit(&ioRing_);
  ioRing_.ring_fd = -1;
}
bool IoUring::isAvailable() {
IoUring ioUring(1);
try {
ioUring.initializeContext();
} catch (...) {
return false;
}
return true;
}
// Creates the ring on first use. The init_ flag is tested once without the
// lock and again under initMutex_, so the setup below runs at most once even
// when several threads call this concurrently.
void IoUring::initializeContext() {
if (!init_.load(std::memory_order_acquire)) {
std::lock_guard<std::mutex> lock(initMutex_);
if (!init_.load(std::memory_order_relaxed)) {
// The submission queue is sized from maxSubmit_, rounded up to a power of two.
int rc = ::io_uring_queue_init_params(
roundUpToNextPowerOfTwo(maxSubmit_), &ioRing_, &params_);
// Throws on failure; init_ then stays false.
checkKernelError(rc, "IoUring: io_uring_queue_init_params failed");
DCHECK_GT(ioRing_.ring_fd, 0);
// In POLLABLE mode, register the eventfd with the ring.
if (pollFd_ != -1) {
CHECK_ERR(io_uring_register_eventfd(&ioRing_, pollFd_));
}
// Publish the fully set-up ring.
init_.store(true, std::memory_order_release);
}
}
}
// Copies the op's prepared sqe into the ring and submits it.
//
// Returns the value of io_uring_submit() on the normal path. On failure a
// negative errno value is returned, because AsyncBase::submit() negates the
// result and passes it on as the errno of the exception it throws:
//   -EINVAL  op is not an IoUringOp
//   -EAGAIN  the ring returned no free submission queue entry
int IoUring::submitOne(AsyncBase::Op* op) {
  IoUringOp* iop = op->getIoUringOp();
  if (!iop) {
    return -EINVAL;
  }

  // Acquiring an sqe and submitting it must not interleave across threads.
  SharedMutex::WriteHolder lk(submitMutex_);
  auto* sqe = io_uring_get_sqe(&ioRing_);
  if (!sqe) {
    return -EAGAIN;
  }

  *sqe = iop->getSqe();
  return io_uring_submit(&ioRing_);
}
// Reaps up to maxRequests completion queue entries into `result`, entering
// the kernel to wait while fewer than minRequests have been collected. Each
// reaped op is marked completed (with the cqe's result) or canceled according
// to `type`, and pending_ is decremented once per reaped entry.
Range<AsyncBase::Op**> IoUring::doWait(
WaitType type,
size_t minRequests,
size_t maxRequests,
std::vector<AsyncBase::Op*>& result) {
result.clear();
size_t count = 0;
while (count < maxRequests) {
struct io_uring_cqe* cqe = nullptr;
if (!io_uring_peek_cqe(&ioRing_, &cqe) && cqe) {
count++;
// The user data is the op pointer attached when the sqe was prepared.
Op* op = reinterpret_cast<Op*>(io_uring_cqe_get_data(cqe));
CHECK(op);
// Read the result before the cqe is marked as seen.
auto res = cqe->res;
io_uring_cqe_seen(&ioRing_, cqe);
decrementPending();
switch (type) {
case WaitType::COMPLETE:
op->complete(res);
break;
case WaitType::CANCEL:
// NOTE(review): in CANCEL mode the entry is still obtained from the
// completion queue and only the op's state differs -- confirm intended.
op->cancel();
break;
}
result.push_back(op);
} else {
if (count < minRequests) {
// Nothing ready yet: wait in the kernel for the remaining completions.
// NOTE(review): the return value is ignored, so a persistent failure here
// would presumably make this loop spin -- confirm.
io_uring_enter(
ioRing_.ring_fd,
0,
minRequests - count,
IORING_ENTER_GETEVENTS,
nullptr);
} else {
break;
}
}
}
return range(result);
}
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
extern "C" {
#include <liburing.h>
}
#include <folly/SharedMutex.h>
#include <folly/experimental/io/AsyncBase.h>
namespace folly {
/**
* An IoUringOp represents a pending operation. You may set a notification
* callback or you may use this class's methods directly.
*
* The op must remain allocated until it is completed or canceled.
*/
class IoUringOp : public AsyncBaseOp {
friend class IoUring;
friend std::ostream& operator<<(std::ostream& stream, const IoUringOp& o);
public:
explicit IoUringOp(NotificationCallback cb = NotificationCallback());
IoUringOp(const IoUringOp&) = delete;
IoUringOp& operator=(const IoUringOp&) = delete;
~IoUringOp() override;
/**
* Initiate a read request.
*/
void pread(int fd, void* buf, size_t size, off_t start) override;
void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override;
/**
* Initiate a write request.
*/
void pwrite(int fd, const void* buf, size_t size, off_t start) override;
void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override;
void reset(NotificationCallback cb = NotificationCallback()) override;
AsyncIOOp* getAsyncIOOp() override {
return nullptr;
}
IoUringOp* getIoUringOp() override {
return this;
}
void toStream(std::ostream& os) const override;
const struct io_uring_sqe& getSqe() const {
return sqe_;
}
private:
struct io_uring_sqe sqe_;
struct iovec iov_[1];
};
std::ostream& operator<<(std::ostream& stream, const IoUringOp& op);
/**
* C++ interface around Linux io_uring
*/
class IoUring : public AsyncBase {
public:
using Op = IoUringOp;
/**
* Note: the maximum number of allowed concurrent requests is controlled
* by the kernel IORING_MAX_ENTRIES and the memlock limit,
* The default IORING_MAX_ENTRIES value is usually 32K.
*
* maxSubmit is clamped to capacity; rounded up to a power of two, it is used
* as the number of entries when the ring is created.
*/
explicit IoUring(
size_t capacity,
PollMode pollMode = NOT_POLLABLE,
size_t maxSubmit = 1);
IoUring(const IoUring&) = delete;
IoUring& operator=(const IoUring&) = delete;
~IoUring() override;
/**
* Return true if a ring of capacity 1 can be initialized.
*/
static bool isAvailable();
private:
// AsyncBase backend hooks.
void initializeContext() override;
int submitOne(AsyncBase::Op* op) override;
Range<AsyncBase::Op**> doWait(
WaitType type,
size_t minRequests,
size_t maxRequests,
std::vector<AsyncBase::Op*>& result) override;
// Requested submission queue size (before rounding), at most capacity.
size_t maxSubmit_;
// Setup parameters; the constructor requests an explicit completion queue
// size here.
struct io_uring_params params_;
// The ring itself; zeroed by the constructor and created on demand.
struct io_uring ioRing_;
// Held exclusively while an sqe is acquired and submitted.
SharedMutex submitMutex_;
};
using IoUringQueue = AsyncBaseQueue;
} // namespace folly
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/io/test/AsyncBaseTestLib.h>

#include <vector>
namespace folly {
namespace test {
// Blocks until `fd` is reported readable by poll(), retrying whenever the call
// is interrupted by a signal. Aborts if poll() fails for any other reason or
// reports anything besides plain POLLIN.
void TestUtil::waitUntilReadable(int fd) {
  pollfd pfd;
  pfd.events = POLLIN;
  pfd.fd = fd;

  int r = -1;
  for (;;) {
    r = poll(&pfd, 1, -1); // wait forever
    const bool interrupted = (r == -1 && errno == EINTR);
    if (!interrupted) {
      break;
    }
  }

  PCHECK(r == 1);
  CHECK_EQ(pfd.revents, POLLIN); // no errors etc
}
// Waits for completions on `reader`. When the reader exposes a pollable fd
// (pollFd() != -1) this blocks on that fd and then collects the completed ops;
// otherwise it falls back to a blocking wait for one request.
folly::Range<folly::AsyncBase::Op**> TestUtil::readerWait(
    folly::AsyncBase* reader) {
  const int pollableFd = reader->pollFd();
  if (pollableFd != -1) {
    waitUntilReadable(pollableFd);
    return reader->pollCompleted();
  }
  return reader->wait(1);
}
// Allocates `size` bytes aligned to kAlign via posix_memalign and hands the
// memory back in a ManagedBuffer that releases it with free(). Aborts with the
// decoded error if the allocation fails.
TestUtil::ManagedBuffer TestUtil::allocateAligned(size_t size) {
  void* raw;
  const int rc = posix_memalign(&raw, kAlign, size);
  CHECK_EQ(rc, 0) << folly::errnoStr(rc);
  char* const bytes = static_cast<char*>(raw);
  return ManagedBuffer(bytes, free);
}
/**
 * Creates a uniquely named file under the system temp directory and fills it
 * with `size` bytes of pseudo-random data. The generator uses a fixed seed, so
 * the contents are reproducible from run to run.
 *
 * `size` must be a multiple of sizeof(uint32_t). Failure to open, write or
 * close the file aborts the process via PCHECK.
 */
TemporaryFile::TemporaryFile(size_t size)
    : path_(folly::fs::temp_directory_path() / folly::fs::unique_path()) {
  CHECK_EQ(size % sizeof(uint32_t), 0);
  size /= sizeof(uint32_t); // from here on `size` counts 32-bit words
  const uint32_t seed = 42;
  std::mt19937 rnd(seed);
  const size_t bufferSize = 1U << 16;
  // 64Ki words is 256 KiB: keep the scratch buffer on the heap rather than the
  // stack so construction is safe on threads/fibers with small stacks.
  std::vector<uint32_t> buffer(bufferSize);
  FILE* fp = ::fopen(path_.c_str(), "wb");
  PCHECK(fp != nullptr);
  while (size) {
    // Generate and write at most one buffer's worth of words per iteration.
    size_t n = std::min(size, bufferSize);
    for (size_t i = 0; i < n; ++i) {
      buffer[i] = rnd();
    }
    size_t written = ::fwrite(buffer.data(), sizeof(uint32_t), n, fp);
    PCHECK(written == n);
    size -= written;
  }
  PCHECK(::fclose(fp) == 0);
}
// Removes the backing file. A filesystem error during removal is logged and
// swallowed so that it never escapes the destructor.
TemporaryFile::~TemporaryFile() {
  try {
    folly::fs::remove(path_);
  } catch (const folly::fs::filesystem_error& removeError) {
    LOG(ERROR) << "fs::remove: " << folly::exceptionStr(removeError);
  }
}
// Returns the shared 6MiB temporary file, creating it on the first call.
TemporaryFile& TemporaryFile::getTempFile() {
  constexpr size_t kSharedFileBytes = 6 << 20; // 6MiB
  static TemporaryFile sharedFile(kSharedFileBytes);
  return sharedFile;
}
} // namespace test
} // namespace folly
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <folly/experimental/io/IoUring.h>
#include <folly/experimental/io/test/AsyncBaseTestLib.h>
#include <folly/init/Init.h>
using folly::IoUring;
namespace folly {
namespace test {
// Instantiate the type-parameterized AsyncTest suite with IoUring as the type
// under test; the instantiation prefix is also "AsyncTest".
INSTANTIATE_TYPED_TEST_CASE_P(AsyncTest, AsyncTest, IoUring);
} // namespace test
} // namespace folly
// Test driver: initializes gtest and folly, then runs the suite only when
// io_uring is usable. An unsupported kernel is reported and treated as
// success (exit code 0) rather than as a failure.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  folly::init(&argc, &argv);

  if (IoUring::isAvailable()) {
    return RUN_ALL_TESTS();
  }

  LOG(INFO)
      << "Not running tests since this kernel version does not support io_uring";
  return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment