1//! This module contains specializations that can offload `io::copy()` operations on file descriptor
2//! containing types (`File`, `TcpStream`, etc.) to more efficient syscalls than `read(2)` and `write(2)`.
3//!
4//! Specialization is only applied to wholly std-owned types so that user code can't observe
5//! that the `Read` and `Write` traits are not used.
6//!
7//! Since a copy operation involves a reader and writer side where each can consist of different types
8//! and also involve generic wrappers (e.g. `Take`, `BufReader`) it is not practical to specialize
9//! a single method on all possible combinations.
10//!
11//! Instead readers and writers are handled separately by the `CopyRead` and `CopyWrite` specialization
12//! traits and then specialized on by the `Copier::copy` method.
13//!
14//! `Copier` uses the specialization traits to unpack the underlying file descriptors and
15//! additional prerequisites and constraints imposed by the wrapper types.
16//!
17//! Once it has obtained all necessary pieces and brought any wrapper types into a state where they
18//! can be safely bypassed it will attempt to use the `copy_file_range(2)`,
19//! `sendfile(2)` or `splice(2)` syscalls to move data directly between file descriptors.
20//! Since those syscalls have requirements that cannot be fully checked in advance it attempts
21//! to use them one after another (guided by hints) to figure out which one works and
22//! falls back to the generic read-write copy loop if none of them does.
23//! Once a working syscall is found for a pair of file descriptors it will be called in a loop
24//! until the copy operation is completed.
25//!
26//! Advantages of using these syscalls:
27//!
28//! * fewer context switches since reads and writes are coalesced into a single syscall
29//! and more bytes are transferred per syscall. This translates to higher throughput
30//! and fewer CPU cycles, at least for sufficiently large transfers to amortize the initial probing.
31//! * `copy_file_range` creates reflink copies on CoW filesystems, thus moving less data and
32//! consuming less disk space
33//! * `sendfile` and `splice` can perform zero-copy IO under some circumstances while
34//! a naive copy loop would move every byte through the CPU.
35//!
36//! Drawbacks:
37//!
38//! * copy operations smaller than the default buffer size can under some circumstances, especially
39//! on older kernels, incur more syscalls than the naive approach would. As mentioned above
40//! the syscall selection is guided by hints to minimize this possibility but they are not perfect.
41//! * optimizations only apply to std types. If a user adds a custom wrapper type, e.g. to report
42//! progress, they can hit a performance cliff.
43//! * complexity
4445#[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd")))]
46use libc::sendfile as sendfile64;
47#[cfg(any(all(target_os = "linux", target_env = "gnu"), target_os = "hurd"))]
48use libc::sendfile64;
49use libc::{EBADF, EINVAL, ENOSYS, EOPNOTSUPP, EOVERFLOW, EPERM, EXDEV};
5051use super::CopyState;
52use crate::cmp::min;
53use crate::fs::{File, Metadata};
54use crate::io::{
55BufRead, BufReader, BufWriter, Error, PipeReader, PipeWriter, Read, Result, StderrLock,
56StdinLock, StdoutLock, Take, Write,
57};
58use crate::mem::ManuallyDrop;
59use crate::net::TcpStream;
60use crate::os::unix::fs::FileTypeExt;
61use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
62use crate::os::unix::net::UnixStream;
63use crate::process::{ChildStderr, ChildStdin, ChildStdout};
64use crate::ptr;
65use crate::sync::atomic::{Atomic, AtomicBool, AtomicU8, Ordering};
66use crate::sys::cvt;
67use crate::sys::fs::CachedFileMetadata;
68use crate::sys::weak::syscall;
6970#[cfg(test)]
71mod tests;
7273pub fn kernel_copy<R: Read + ?Sized, W: Write + ?Sized>(
74 read: &mut R,
75 write: &mut W,
76) -> Result<CopyState> {
77let copier = Copier { read, write };
78 SpecCopy::copy(copier)
79}
/// What is known about the kind of file behind a `RawFd`.
///
/// The value is either inferred from the Rust type the descriptor was extracted from
/// (`Socket`, `Pipe`) or it is the actual metadata obtained for the descriptor.
///
/// Everything derived from this type is only a hint: because of `AsRawFd` and
/// `FromRawFd` the inferred kind may be wrong.
enum FdMeta {
    /// Metadata that was actually obtained for the descriptor.
    Metadata(Metadata),
    /// Inferred from the source type: the descriptor is assumed to be a socket.
    Socket,
    /// Inferred from the source type: the descriptor is assumed to be a pipe.
    Pipe,
    /// We don't have any metadata because the stat syscall failed
    NoneObtained,
}
/// Identifies which side of a copy operation a file descriptor belongs to.
///
/// Used by `FdMeta::copy_file_range_candidate` to apply different requirements to the
/// input and the output descriptor.
#[derive(PartialEq)]
enum FdHandle {
    /// The descriptor data is read from.
    Input,
    /// The descriptor data is written to.
    Output,
}
99100impl FdMeta {
101fn maybe_fifo(&self) -> bool {
102match self {
103 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
104 FdMeta::Socket => false,
105 FdMeta::Pipe => true,
106 FdMeta::NoneObtained => true,
107 }
108 }
109110fn potential_sendfile_source(&self) -> bool {
111match self {
112// procfs erroneously shows 0 length on non-empty readable files.
113 // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
114 // thus there would be benefit from attempting sendfile
115FdMeta::Metadata(meta)
116if meta.file_type().is_file() && meta.len() > 0
117|| meta.file_type().is_block_device() =>
118 {
119true
120}
121_ => false,
122 }
123 }
124125fn copy_file_range_candidate(&self, f: FdHandle) -> bool {
126match self {
127// copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
128 // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
129FdMeta::Metadata(meta) if f == FdHandle::Input && meta.is_file() && meta.len() > 0 => {
130true
131}
132 FdMeta::Metadata(meta) if f == FdHandle::Output && meta.is_file() => true,
133_ => false,
134 }
135 }
136}
137138/// Returns true either if changes made to the source after a sendfile/splice call won't become
139/// visible in the sink or the source has explicitly opted into such behavior (e.g. by splicing
140/// a file into a pipe, the pipe being the source in this case).
141///
142/// This will prevent File -> Pipe and File -> Socket splicing/sendfile optimizations to uphold
143/// the Read/Write API semantics of io::copy.
144///
145/// Note: This is not 100% airtight, the caller can use the RawFd conversion methods to turn a
146/// regular file into a TcpSocket which will be treated as a socket here without checking.
147fn safe_kernel_copy(source: &FdMeta, sink: &FdMeta) -> bool {
148match (source, sink) {
149// Data arriving from a socket is safe because the sender can't modify the socket buffer.
150 // Data arriving from a pipe is safe(-ish) because either the sender *copied*
151 // the bytes into the pipe OR explicitly performed an operation that enables zero-copy,
152 // thus promising not to modify the data later.
153(FdMeta::Socket, _) => true,
154 (FdMeta::Pipe, _) => true,
155 (FdMeta::Metadata(meta), _)
156if meta.file_type().is_fifo() || meta.file_type().is_socket() =>
157 {
158true
159}
160// Data going into non-pipes/non-sockets is safe because the "later changes may become visible" issue
161 // only happens for pages sitting in send buffers or pipes.
162(_, FdMeta::Metadata(meta))
163if !meta.file_type().is_fifo() && !meta.file_type().is_socket() =>
164 {
165true
166}
167_ => false,
168 }
169}
170171struct CopyParams(FdMeta, Option<RawFd>);
/// Bundles the two ends of a copy operation so that `SpecCopy` can be specialized
/// on the combination of reader and writer type.
struct Copier<'a, 'b, R: Read + ?Sized, W: Write + ?Sized> {
    /// Source the data is copied from.
    read: &'a mut R,
    /// Sink the data is copied into.
    write: &'b mut W,
}
177178trait SpecCopy {
179fn copy(self) -> Result<CopyState>;
180}
181182impl<R: Read + ?Sized, W: Write + ?Sized> SpecCopyfor Copier<'_, '_, R, W> {
183 default fn copy(self) -> Result<CopyState> {
184Ok(CopyState::Fallback(0))
185 }
186}
187188impl<R: CopyRead, W: CopyWrite> SpecCopyfor Copier<'_, '_, R, W> {
189fn copy(self) -> Result<CopyState> {
190let (reader, writer) = (self.read, self.write);
191let r_cfg = reader.properties();
192let w_cfg = writer.properties();
193194// before direct operations on file descriptors ensure that all source and sink buffers are empty
195let mut flush = || -> Result<u64> {
196let bytes = reader.drain_to(writer, u64::MAX)?;
197// BufWriter buffered bytes have already been accounted for in earlier write() calls
198writer.flush()?;
199Ok(bytes)
200 };
201202let mut written = 0u64;
203204if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
205 (r_cfg, w_cfg)
206 {
207written += flush()?;
208let max_write = reader.min_limit();
209210if input_meta.copy_file_range_candidate(FdHandle::Input)
211 && output_meta.copy_file_range_candidate(FdHandle::Output)
212 {
213let result = copy_regular_files(readfd, writefd, max_write);
214result.update_take(reader);
215216match result {
217 CopyResult::Ended(bytes_copied) => {
218return Ok(CopyState::Ended(bytes_copied + written));
219 }
220 CopyResult::Error(e, _) => return Err(e),
221 CopyResult::Fallback(bytes) => written += bytes,
222 }
223 }
224225// on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
226 // to any writable file descriptor. On older kernels the writer side can only be a socket.
227 // So we just try and fallback if needed.
228 // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
229 // fall back to the generic copy loop.
230if input_meta.potential_sendfile_source() && safe_kernel_copy(&input_meta, &output_meta)
231 {
232let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
233result.update_take(reader);
234235match result {
236 CopyResult::Ended(bytes_copied) => {
237return Ok(CopyState::Ended(bytes_copied + written));
238 }
239 CopyResult::Error(e, _) => return Err(e),
240 CopyResult::Fallback(bytes) => written += bytes,
241 }
242 }
243244if (input_meta.maybe_fifo() || output_meta.maybe_fifo())
245 && safe_kernel_copy(&input_meta, &output_meta)
246 {
247let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
248result.update_take(reader);
249250match result {
251 CopyResult::Ended(bytes_copied) => {
252return Ok(CopyState::Ended(bytes_copied + written));
253 }
254 CopyResult::Error(e, _) => return Err(e),
255 CopyResult::Fallback(0) => { /* use the fallback below */ }
256 CopyResult::Fallback(_) => {
257{
::core::panicking::panic_fmt(format_args!("internal error: entered unreachable code: {0}",
format_args!("splice should not return > 0 bytes on the fallback path")));
}unreachable!("splice should not return > 0 bytes on the fallback path")258 }
259 }
260 }
261 }
262263// fallback if none of the more specialized syscalls wants to work with these file descriptors
264Ok(CopyState::Fallback(written))
265 }
266}
267268#[rustc_specialization_trait]
269trait CopyRead: Read {
270/// Implementations that contain buffers (i.e. `BufReader`) must transfer data from their internal
271 /// buffers into `writer` until either the buffers are emptied or `limit` bytes have been
272 /// transferred, whichever occurs sooner.
273 /// If nested buffers are present the outer buffers must be drained first.
274 ///
275 /// This is necessary to directly bypass the wrapper types while preserving the data order
276 /// when operating directly on the underlying file descriptors.
277fn drain_to<W: Write>(&mut self, _writer: &mut W, _limit: u64) -> Result<u64> {
278Ok(0)
279 }
280281/// Updates `Take` wrappers to remove the number of bytes copied.
282fn taken(&mut self, _bytes: u64) {}
283284/// The minimum of the limit of all `Take<_>` wrappers, `u64::MAX` otherwise.
285 /// This method does not account for data `BufReader` buffers and would underreport
286 /// the limit of a `Take<BufReader<Take<_>>>` type. Thus its result is only valid
287 /// after draining the buffers via `drain_to`.
288fn min_limit(&self) -> u64 {
289u64::MAX290 }
291292/// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
293fn properties(&self) -> CopyParams;
294}
295296#[rustc_specialization_trait]
297trait CopyWrite: Write {
298/// Extracts the file descriptor and hints/metadata, delegating through wrappers if necessary.
299fn properties(&self) -> CopyParams;
300}
301302impl<T> CopyReadfor &mut T
303where
304T: CopyRead,
305{
306fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
307 (**self).drain_to(writer, limit)
308 }
309310fn taken(&mut self, bytes: u64) {
311 (**self).taken(bytes);
312 }
313314fn min_limit(&self) -> u64 {
315 (**self).min_limit()
316 }
317318fn properties(&self) -> CopyParams {
319 (**self).properties()
320 }
321}
322323impl<T> CopyWritefor &mut T
324where
325T: CopyWrite,
326{
327fn properties(&self) -> CopyParams {
328 (**self).properties()
329 }
330}
331332impl CopyReadfor File {
333fn properties(&self) -> CopyParams {
334CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
335 }
336}
337338impl CopyReadfor &File {
339fn properties(&self) -> CopyParams {
340CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
341 }
342}
343344impl CopyWritefor File {
345fn properties(&self) -> CopyParams {
346CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
347 }
348}
349350impl CopyWritefor &File {
351fn properties(&self) -> CopyParams {
352CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
353 }
354}
355356impl CopyReadfor TcpStream {
357fn properties(&self) -> CopyParams {
358// avoid the stat syscall since we can be fairly sure it's a socket
359CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
360 }
361}
362363impl CopyReadfor &TcpStream {
364fn properties(&self) -> CopyParams {
365// avoid the stat syscall since we can be fairly sure it's a socket
366CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
367 }
368}
369370impl CopyWritefor TcpStream {
371fn properties(&self) -> CopyParams {
372// avoid the stat syscall since we can be fairly sure it's a socket
373CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
374 }
375}
376377impl CopyWritefor &TcpStream {
378fn properties(&self) -> CopyParams {
379// avoid the stat syscall since we can be fairly sure it's a socket
380CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
381 }
382}
383384impl CopyReadfor UnixStream {
385fn properties(&self) -> CopyParams {
386// avoid the stat syscall since we can be fairly sure it's a socket
387CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
388 }
389}
390391impl CopyReadfor &UnixStream {
392fn properties(&self) -> CopyParams {
393// avoid the stat syscall since we can be fairly sure it's a socket
394CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
395 }
396}
397398impl CopyWritefor UnixStream {
399fn properties(&self) -> CopyParams {
400// avoid the stat syscall since we can be fairly sure it's a socket
401CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
402 }
403}
404405impl CopyWritefor &UnixStream {
406fn properties(&self) -> CopyParams {
407// avoid the stat syscall since we can be fairly sure it's a socket
408CopyParams(FdMeta::Socket, Some(self.as_raw_fd()))
409 }
410}
411412impl CopyReadfor PipeReader {
413fn properties(&self) -> CopyParams {
414CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
415 }
416}
417418impl CopyReadfor &PipeReader {
419fn properties(&self) -> CopyParams {
420CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
421 }
422}
423424impl CopyWritefor PipeWriter {
425fn properties(&self) -> CopyParams {
426CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
427 }
428}
429430impl CopyWritefor &PipeWriter {
431fn properties(&self) -> CopyParams {
432CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
433 }
434}
435436impl CopyWritefor ChildStdin {
437fn properties(&self) -> CopyParams {
438CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
439 }
440}
441442impl CopyReadfor ChildStdout {
443fn properties(&self) -> CopyParams {
444CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
445 }
446}
447448impl CopyReadfor ChildStderr {
449fn properties(&self) -> CopyParams {
450CopyParams(FdMeta::Pipe, Some(self.as_raw_fd()))
451 }
452}
453454impl CopyReadfor StdinLock<'_> {
455fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
456let buf_reader = self.as_mut_buf();
457let buf = buf_reader.buffer();
458let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
459let bytes_drained = buf.len();
460 writer.write_all(buf)?;
461buf_reader.consume(bytes_drained);
462463Ok(bytes_drainedas u64)
464 }
465466fn properties(&self) -> CopyParams {
467CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
468 }
469}
470471impl CopyWritefor StdoutLock<'_> {
472fn properties(&self) -> CopyParams {
473CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
474 }
475}
476477impl CopyWritefor StderrLock<'_> {
478fn properties(&self) -> CopyParams {
479CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
480 }
481}
482483impl<T: CopyRead> CopyReadfor Take<T> {
484fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
485let local_limit = self.limit();
486let combined_limit = min(outer_limit, local_limit);
487let bytes_drained = self.get_mut().drain_to(writer, combined_limit)?;
488// update limit since read() was bypassed
489self.set_limit(local_limit - bytes_drained);
490491Ok(bytes_drained)
492 }
493494fn taken(&mut self, bytes: u64) {
495self.set_limit(self.limit() - bytes);
496self.get_mut().taken(bytes);
497 }
498499fn min_limit(&self) -> u64 {
500min(Take::limit(self), self.get_ref().min_limit())
501 }
502503fn properties(&self) -> CopyParams {
504self.get_ref().properties()
505 }
506}
507508impl<T: ?Sized + CopyRead> CopyReadfor BufReader<T> {
509fn drain_to<W: Write>(&mut self, writer: &mut W, outer_limit: u64) -> Result<u64> {
510let buf = self.buffer();
511let buf = &buf[0..min(buf.len(), outer_limit.try_into().unwrap_or(usize::MAX))];
512let bytes = buf.len();
513 writer.write_all(buf)?;
514self.consume(bytes);
515516let remaining = outer_limit - bytesas u64;
517518// in case of nested bufreaders we also need to drain the ones closer to the source
519let inner_bytes = self.get_mut().drain_to(writer, remaining)?;
520521Ok(bytesas u64 + inner_bytes)
522 }
523524fn taken(&mut self, bytes: u64) {
525self.get_mut().taken(bytes);
526 }
527528fn min_limit(&self) -> u64 {
529self.get_ref().min_limit()
530 }
531532fn properties(&self) -> CopyParams {
533self.get_ref().properties()
534 }
535}
536537impl<T: ?Sized + CopyWrite> CopyWritefor BufWriter<T> {
538fn properties(&self) -> CopyParams {
539self.get_ref().properties()
540 }
541}
542543impl CopyReadfor CachedFileMetadata {
544fn properties(&self) -> CopyParams {
545CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd()))
546 }
547}
548549impl CopyWritefor CachedFileMetadata {
550fn properties(&self) -> CopyParams {
551CopyParams(FdMeta::Metadata(self.1.clone()), Some(self.0.as_raw_fd()))
552 }
553}
554555fn fd_to_meta<T: AsRawFd>(fd: &T) -> FdMeta {
556let fd = fd.as_raw_fd();
557let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
558match file.metadata() {
559Ok(meta) => FdMeta::Metadata(meta),
560Err(_) => FdMeta::NoneObtained,
561 }
562}
/// Outcome of one attempt to copy via a specific syscall.
///
/// Every variant carries the number of bytes that this attempt transferred.
enum CopyResult {
    /// The copy is complete.
    Ended(u64),
    /// The syscall failed with an error that has to be reported to the caller.
    Error(Error, u64),
    /// The syscall cannot be used (any further); another strategy has to take over.
    Fallback(u64),
}
569570impl CopyResult {
571fn update_take(&self, reader: &mut impl CopyRead) {
572match *self {
573 CopyResult::Fallback(bytes)
574 | CopyResult::Ended(bytes)
575 | CopyResult::Error(_, bytes) => reader.taken(bytes),
576 }
577 }
578}
/// Invalid file descriptor.
///
/// Valid file descriptors are guaranteed to be non-negative numbers (see `open()` manpage)
/// while negative values are used to indicate errors.
/// Thus -1 will never overlap with a valid open file.
const INVALID_FD: RawFd = -1;
586587/// Linux-specific implementation that will attempt to use copy_file_range for copy offloading.
588/// As the name says, it only works on regular files.
589///
590/// Callers must handle fallback to a generic copy loop.
591/// `Fallback` may indicate non-zero number of bytes already written
592/// if one of the files' cursor +`max_len` would exceed u64::MAX (`EOVERFLOW`).
593fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
594use crate::cmp;
595596const NOT_PROBED: u8 = 0;
597const UNAVAILABLE: u8 = 1;
598const AVAILABLE: u8 = 2;
599600// Kernel prior to 4.5 don't have copy_file_range
601 // We store the availability in a global to avoid unnecessary syscalls
602static HAS_COPY_FILE_RANGE: Atomic<u8> = AtomicU8::new(NOT_PROBED);
603604let mut have_probed = match HAS_COPY_FILE_RANGE.load(Ordering::Relaxed) {
605NOT_PROBED => false,
606UNAVAILABLE => return CopyResult::Fallback(0),
607_ => true,
608 };
609610unsafe fn copy_file_range(fd_in: libc::c_int, off_in: *mut libc::loff_t,
fd_out: libc::c_int, off_out: *mut libc::loff_t, len: libc::size_t,
flags: libc::c_uint) -> libc::ssize_t {
let ref copy_file_range:
ExternWeak<unsafe extern "C" fn(libc::c_int, *mut libc::loff_t,
libc::c_int, *mut libc::loff_t, libc::size_t, libc::c_uint)
-> libc::ssize_t> =
{
unsafe extern "C" {
#[linkage = "extern_weak"]
static copy_file_range:
Option<unsafe extern "C" fn(libc::c_int, *mut libc::loff_t,
libc::c_int, *mut libc::loff_t, libc::size_t, libc::c_uint)
-> libc::ssize_t>;
}
#[allow(unused_unsafe)]
ExternWeak::new(unsafe { copy_file_range })
};
if let Some(fun) = copy_file_range.get() {
unsafe { fun(fd_in, off_in, fd_out, off_out, len, flags) }
} else {
unsafe {
libc::syscall(libc::SYS_copy_file_range, fd_in, off_in, fd_out,
off_out, len, flags) as libc::ssize_t
}
}
}syscall!(
611fn copy_file_range(
612 fd_in: libc::c_int,
613 off_in: *mut libc::loff_t,
614 fd_out: libc::c_int,
615 off_out: *mut libc::loff_t,
616 len: libc::size_t,
617 flags: libc::c_uint,
618 ) -> libc::ssize_t;
619 );
620621fn probe_copy_file_range_support() -> u8 {
622// In some cases, we cannot determine availability from the first
623 // `copy_file_range` call. In this case, we probe with an invalid file
624 // descriptor so that the results are easily interpretable.
625match unsafe {
626cvt(copy_file_range(INVALID_FD, ptr::null_mut(), INVALID_FD, ptr::null_mut(), 1, 0))
627 .map_err(|e| e.raw_os_error())
628 } {
629Err(Some(EPERM | ENOSYS)) => UNAVAILABLE,
630Err(Some(EBADF)) => AVAILABLE,
631Ok(_) => {
::core::panicking::panic_fmt(format_args!("unexpected copy_file_range probe success"));
}panic!("unexpected copy_file_range probe success"),
632// Treat other errors as the syscall
633 // being unavailable.
634Err(_) => UNAVAILABLE,
635 }
636 }
637638let mut written = 0u64;
639while written < max_len {
640let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
641// cap to 1GB chunks in case u64::MAX is passed as max_len and the file has a non-zero seek position
642 // this allows us to copy large chunks without hitting EOVERFLOW,
643 // unless someone sets a file offset close to u64::MAX - 1GB, in which case a fallback would be required
644let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x4000_0000usize);
645let copy_result = unsafe {
646// We actually don't have to adjust the offsets,
647 // because copy_file_range adjusts the file offset automatically
648cvt(copy_file_range(reader, ptr::null_mut(), writer, ptr::null_mut(), bytes_to_copy, 0))
649 };
650651if !have_probed && copy_result.is_ok() {
652 have_probed = true;
653 HAS_COPY_FILE_RANGE.store(AVAILABLE, Ordering::Relaxed);
654 }
655656match copy_result {
657Ok(0) if written == 0 => {
658// fallback to work around several kernel bugs where copy_file_range will fail to
659 // copy any bytes and return 0 instead of an error if
660 // - reading virtual files from the proc filesystem which appear to have 0 size
661 // but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
662 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
663return CopyResult::Fallback(0);
664 }
665Ok(0) => return CopyResult::Ended(written), // reached EOF
666Ok(ret) => written += ret as u64,
667Err(err) => {
668return match err.raw_os_error() {
669// when file offset + max_length > u64::MAX
670Some(EOVERFLOW) => CopyResult::Fallback(written),
671Some(raw_os_error @ (ENOSYS | EXDEV | EINVAL | EPERM | EOPNOTSUPP | EBADF))
672if written == 0 =>
673 {
674if !have_probed {
675let available = if #[allow(non_exhaustive_omitted_patterns)] match raw_os_error {
ENOSYS | EOPNOTSUPP | EPERM => true,
_ => false,
}matches!(raw_os_error, ENOSYS | EOPNOTSUPP | EPERM) {
676// EPERM can indicate seccomp filters or an
677 // immutable file. To distinguish these
678 // cases we probe with invalid file
679 // descriptors which should result in EBADF
680 // if the syscall is supported and EPERM or
681 // ENOSYS if it's not available.
682 //
683 // For EOPNOTSUPP, see below. In the case of
684 // ENOSYS, we try to cover for faulty FUSE
685 // drivers.
686probe_copy_file_range_support()
687 } else {
688 AVAILABLE
689 };
690 HAS_COPY_FILE_RANGE.store(available, Ordering::Relaxed);
691 }
692693// Try fallback io::copy if either:
694 // - Kernel version is < 4.5 (ENOSYS¹)
695 // - Files are mounted on different fs (EXDEV)
696 // - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
697 // - copy_file_range file is immutable or syscall is blocked by seccomp¹ (EPERM)
698 // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
699 // - the writer fd was opened with O_APPEND (EBADF²)
700 // and no bytes were written successfully yet. (All these errnos should
701 // not be returned if something was already written, but they happen in
702 // the wild, see #91152.)
703 //
704 // ¹ these cases should be detected by the initial probe but we handle them here
705 // anyway in case syscall interception changes during runtime
706 // ² actually invalid file descriptors would cause this too, but in that case
707 // the fallback code path is expected to encounter the same error again
708CopyResult::Fallback(0)
709 }
710_ => CopyResult::Error(err, written),
711 };
712 }
713 }
714 }
715 CopyResult::Ended(written)
716}
/// Selects which syscall `sendfile_splice` issues.
#[derive(PartialEq)]
enum SpliceMode {
    /// Use `sendfile`.
    Sendfile,
    /// Use `splice`.
    Splice,
}
723724/// performs splice or sendfile between file descriptors
725/// Does _not_ fall back to a generic copy loop.
726fn sendfile_splice(mode: SpliceMode, reader: RawFd, writer: RawFd, len: u64) -> CopyResult {
727static HAS_SENDFILE: Atomic<bool> = AtomicBool::new(true);
728static HAS_SPLICE: Atomic<bool> = AtomicBool::new(true);
729730// Android builds use feature level 14, but the libc wrapper for splice is
731 // gated on feature level 21+, so we have to invoke the syscall directly.
732#[cfg(target_os = "android")]
733syscall!(
734fn splice(
735 srcfd: libc::c_int,
736 src_offset: *const i64,
737 dstfd: libc::c_int,
738 dst_offset: *const i64,
739 len: libc::size_t,
740 flags: libc::c_int,
741 ) -> libc::ssize_t;
742 );
743744#[cfg(target_os = "linux")]
745use libc::splice;
746747match mode {
748 SpliceMode::Sendfileif !HAS_SENDFILE.load(Ordering::Relaxed) => {
749return CopyResult::Fallback(0);
750 }
751 SpliceMode::Spliceif !HAS_SPLICE.load(Ordering::Relaxed) => {
752return CopyResult::Fallback(0);
753 }
754_ => (),
755 }
756757let mut written = 0u64;
758while written < len {
759// according to its manpage that's the maximum size sendfile() will copy per invocation
760let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
761762let result = match mode {
763 SpliceMode::Sendfile => {
764 cvt(unsafe { sendfile64(writer, reader, ptr::null_mut(), chunk_size) })
765 }
766 SpliceMode::Splice => cvt(unsafe {
767 splice(reader, ptr::null_mut(), writer, ptr::null_mut(), chunk_size, 0)
768 }),
769 };
770771match result {
772Ok(0) => break, // EOF
773Ok(ret) => written += ret as u64,
774Err(err) => {
775return match err.raw_os_error() {
776Some(ENOSYS | EPERM) => {
777// syscall not supported (ENOSYS)
778 // syscall is disallowed, e.g. by seccomp (EPERM)
779match mode {
780 SpliceMode::Sendfile => HAS_SENDFILE.store(false, Ordering::Relaxed),
781 SpliceMode::Splice => HAS_SPLICE.store(false, Ordering::Relaxed),
782 }
783match (&written, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(written, 0);
784 CopyResult::Fallback(0)
785 }
786Some(EINVAL) => {
787// splice/sendfile do not support this particular file descriptor (EINVAL)
788match (&written, &0) {
(left_val, right_val) => {
if !(*left_val == *right_val) {
let kind = ::core::panicking::AssertKind::Eq;
::core::panicking::assert_failed(kind, &*left_val, &*right_val,
::core::option::Option::None);
}
}
};assert_eq!(written, 0);
789 CopyResult::Fallback(0)
790 }
791Some(os_err) if mode == SpliceMode::Sendfile && os_err == EOVERFLOW => {
792 CopyResult::Fallback(written)
793 }
794_ => CopyResult::Error(err, written),
795 };
796 }
797 }
798 }
799 CopyResult::Ended(written)
800}