From 227061b9c9e17eb1d253c4786b180f2c71e7028c Mon Sep 17 00:00:00 2001 From: Robert Nelson Date: Thu, 23 Jul 2020 14:53:36 -0500 Subject: [PATCH 001/133] merge: can-isotp: https://github.com/hartkopp/can-isotp https://github.com/hartkopp/can-isotp/commit/ced84cab120a1ddde2bd4561a18514707c8a6184 Signed-off-by: Robert Nelson --- include/uapi/linux/can/isotp.h | 135 +++ net/can/isotp.c | 1537 ++++++++++++++++++++++++++++++++ 2 files changed, 1672 insertions(+) create mode 100644 include/uapi/linux/can/isotp.h create mode 100644 net/can/isotp.c diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h new file mode 100644 index 0000000000000..548e732f33a8e --- /dev/null +++ b/include/uapi/linux/can/isotp.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * socketcan/can/isotp.h + * + * Definitions for isotp CAN sockets + * + * Author: Oliver Hartkopp + * Copyright (c) 2008 Volkswagen Group Electronic Research + * All rights reserved. + * + * Send feedback to + * + */ + +#ifndef CAN_ISOTP_H +#define CAN_ISOTP_H + +#include + +#define SOL_CAN_ISOTP (SOL_CAN_BASE + CAN_ISOTP) + +/* for socket options affecting the socket (not the global system) */ + +#define CAN_ISOTP_OPTS 1 /* pass struct can_isotp_options */ + +#define CAN_ISOTP_RECV_FC 2 /* pass struct can_isotp_fc_options */ + +/* sockopts to force stmin timer values for protocol regression tests */ + +#define CAN_ISOTP_TX_STMIN 3 /* pass __u32 value in nano secs */ + /* use this time instead of value */ + /* provided in FC from the receiver */ + +#define CAN_ISOTP_RX_STMIN 4 /* pass __u32 value in nano secs */ + /* ignore received CF frames which */ + /* timestamps differ less than val */ + +#define CAN_ISOTP_LL_OPTS 5 /* pass struct can_isotp_ll_options */ + +struct can_isotp_options { + + __u32 flags; /* set flags for isotp behaviour. */ + /* __u32 value : flags see below */ + + __u32 frame_txtime; /* frame transmission time (N_As/N_Ar) */ + /* __u32 value : time in nano secs */ + + __u8 ext_address; /* set address for extended addressing */ + /* __u8 value : extended address */ + + __u8 txpad_content; /* set content of padding byte (tx) */ + /* __u8 value : content on tx path */ + + __u8 rxpad_content; /* set content of padding byte (rx) */ + /* __u8 value : content on rx path */ + + __u8 rx_ext_address; /* set address for extended addressing */ + /* __u8 value : extended address (rx) */ +}; + +struct can_isotp_fc_options { + + __u8 bs; /* blocksize provided in FC frame */ + /* __u8 value : blocksize. 0 = off */ + + __u8 stmin; /* separation time provided in FC frame */ + /* __u8 value : */ + /* 0x00 - 0x7F : 0 - 127 ms */ + /* 0x80 - 0xF0 : reserved */ + /* 0xF1 - 0xF9 : 100 us - 900 us */ + /* 0xFA - 0xFF : reserved */ + + __u8 wftmax; /* max. number of wait frame transmiss. */ + /* __u8 value : 0 = omit FC N_PDU WT */ +}; + +struct can_isotp_ll_options { + + __u8 mtu; /* generated & accepted CAN frame type */ + /* __u8 value : */ + /* CAN_MTU (16) -> standard CAN 2.0 */ + /* CANFD_MTU (72) -> CAN FD frame */ + + __u8 tx_dl; /* tx link layer data length in bytes */ + /* (configured maximum payload length) */ + /* __u8 value : 8,12,16,20,24,32,48,64 */ + /* => rx path supports all LL_DL values */ + + __u8 tx_flags; /* set into struct canfd_frame.flags */ + /* at frame creation: e.g. 
CANFD_BRS */ + /* Obsolete when the BRS flag is fixed */ + /* by the CAN netdriver configuration */ +}; + +/* flags for isotp behaviour */ + +#define CAN_ISOTP_LISTEN_MODE 0x001 /* listen only (do not send FC) */ +#define CAN_ISOTP_EXTEND_ADDR 0x002 /* enable extended addressing */ +#define CAN_ISOTP_TX_PADDING 0x004 /* enable CAN frame padding tx path */ +#define CAN_ISOTP_RX_PADDING 0x008 /* enable CAN frame padding rx path */ +#define CAN_ISOTP_CHK_PAD_LEN 0x010 /* check received CAN frame padding */ +#define CAN_ISOTP_CHK_PAD_DATA 0x020 /* check received CAN frame padding */ +#define CAN_ISOTP_HALF_DUPLEX 0x040 /* half duplex error state handling */ +#define CAN_ISOTP_FORCE_TXSTMIN 0x080 /* ignore stmin from received FC */ +#define CAN_ISOTP_FORCE_RXSTMIN 0x100 /* ignore CFs depending on rx stmin */ +#define CAN_ISOTP_RX_EXT_ADDR 0x200 /* different rx extended addressing */ + + +/* default values */ + +#define CAN_ISOTP_DEFAULT_FLAGS 0 +#define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 +#define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_RECV_BS 0 +#define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 +#define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * Remark on CAN_ISOTP_DEFAULT_RECV_* values: + * + * We can strongly assume, that the Linux Kernel implementation of + * CAN_ISOTP is capable to run with BS=0, STmin=0 and WFTmax=0. + * But as we like to be able to behave as a commonly available ECU, + * these default settings can be changed via sockopts. + * For that reason the STmin value is intentionally _not_ checked for + * consistency and copied directly into the flow control (FC) frame. + * + */ + +#endif diff --git a/net/can/isotp.c b/net/can/isotp.c new file mode 100644 index 0000000000000..a0d20d9443bf5 --- /dev/null +++ b/net/can/isotp.c @@ -0,0 +1,1537 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN + * + * WARNING: This is ALPHA code for discussions and first tests that should + * not be used in production environments. + * + * In the discussion the Socket-API to the userspace or the ISO-TP socket + * options or the return values we may change! Current behaviour: + * + * - no ISO-TP specific return values are provided to the userspace + * - when a transfer (tx) is on the run the next write() blocks until it's done + * - no support for sending wait frames to the data source in the rx path + * + * Copyright (c) 2017 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) +#include +#define CAN_SKBRES sizeof(struct can_skb_priv) +#else +#define CAN_SKBRES 0 +#endif +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) +#include +#endif +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) +#include "compat.h" +#endif + +#define CAN_ISOTP_VERSION "20181217" +static __initdata const char banner[] = + KERN_INFO "can: isotp protocol (rev " CAN_ISOTP_VERSION " alpha)\n"; + +MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Oliver Hartkopp "); +MODULE_ALIAS("can-proto-6"); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) +#error This modules needs hrtimers (available since Kernel 2.6.22) +#endif + +#define DBG(fmt, args...) (printk( KERN_DEBUG "can-isotp: %s: " fmt, \ + __func__, ##args)) +#undef DBG +#define DBG(fmt, args...) + +#define SINGLE_MASK(id) ((id & CAN_EFF_FLAG) ? \ + (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ + (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) + +/* + ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can + take full 32 bit values (4 Gbyte). We would need some good concept to handle + this between user space and kernel space. For now increase the static buffer + to something about 8 kbyte to be able to test this new functionality. 
+*/ +#define MAX_MSG_LENGTH 8200 + +/* N_PCI type values in bits 7-4 of N_PCI bytes */ +#define N_PCI_SF 0x00 /* single frame */ +#define N_PCI_FF 0x10 /* first frame */ +#define N_PCI_CF 0x20 /* consecutive frame */ +#define N_PCI_FC 0x30 /* flow control */ + +#define N_PCI_SZ 1 /* size of the PCI byte #1 */ +#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ +#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ +#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ +#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ +#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ + +#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) + +/* Flow Status given in FC frame */ +#define ISOTP_FC_CTS 0 /* clear to send */ +#define ISOTP_FC_WT 1 /* wait */ +#define ISOTP_FC_OVFLW 2 /* overflow */ + +enum { + ISOTP_IDLE = 0, + ISOTP_WAIT_FIRST_FC, + ISOTP_WAIT_FC, + ISOTP_WAIT_DATA, + ISOTP_SENDING +}; + +struct tpcon { + int idx; + int len; + u8 state; + u8 bs; + u8 sn; + u8 ll_dl; + u8 buf[MAX_MSG_LENGTH+1]; +}; + +struct isotp_sock { + struct sock sk; + int bound; + int ifindex; + canid_t txid; + canid_t rxid; + ktime_t tx_gap; + ktime_t lastrxcf_tstamp; + struct hrtimer rxtimer, txtimer; + struct tasklet_struct txtsklet; + struct can_isotp_options opt; + struct can_isotp_fc_options rxfc, txfc; + struct can_isotp_ll_options ll; + __u32 force_tx_stmin; + __u32 force_rx_stmin; + struct tpcon rx, tx; + struct notifier_block notifier; + wait_queue_head_t wait; +}; + +static inline struct isotp_sock *isotp_sk(const struct sock *sk) +{ + return (struct isotp_sock *)sk; +} + +static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + rxtimer); + if (so->rx.state == ISOTP_WAIT_DATA) { + DBG("we did not get new data frames in time.\n"); + + /* reset tx state */ + so->rx.state = ISOTP_IDLE; + } + + return HRTIMER_NORESTART; +} + +static void isotp_skb_reserve(struct sk_buff *skb, struct net_device *dev) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0) + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,5) + can_skb_prv(skb)->skbcnt = 0; +#endif +} + +static void isotp_skb_destructor(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +static inline void isotp_skb_set_owner(struct sk_buff *skb, struct sock *sk) +{ + if (sk) { + sock_hold(sk); + skb->destructor = isotp_skb_destructor; + skb->sk = sk; + } +} + +static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) +{ + struct net_device *dev; + struct sk_buff *nskb; + struct canfd_frame *ncf; + struct isotp_sock *so = isotp_sk(sk); + + nskb = alloc_skb(so->ll.mtu + CAN_SKBRES, gfp_any()); + if (!nskb) + return 1; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) { + kfree_skb(nskb); + return 1; + } + isotp_skb_reserve(nskb, dev); + nskb->dev = dev; + isotp_skb_set_owner(nskb, sk); + ncf = (struct canfd_frame *) nskb->data; + skb_put(nskb, so->ll.mtu); + + /* create & send flow control reply */ + ncf->can_id = so->txid; + + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); + ncf->len = CAN_MAX_DLEN; + } else + ncf->len = ae + FC_CONTENT_SZ; + + ncf->data[ae] = N_PCI_FC | flowstatus; + ncf->data[ae + 1] = so->rxfc.bs; + ncf->data[ae + 2] = so->rxfc.stmin; + + if (ae) + ncf->data[0] = 
so->opt.ext_address; + + if (so->ll.mtu == CANFD_MTU) + ncf->flags = so->ll.tx_flags; + + can_send(nskb, 1); + dev_put(dev); + + /* reset blocksize counter */ + so->rx.bs = 0; + + /* reset last CF frame rx timestamp for rx stmin enforcement */ + so->lastrxcf_tstamp = ktime_set(0,0); + + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1,0), HRTIMER_MODE_REL); + return 0; +} + +static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = skb->dev->ifindex; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +static u8 padlen(u8 datalen) +{ + const u8 plen[] = {8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ + 12, 12, 12, 12, /* 9 - 12 */ + 16, 16, 16, 16, /* 13 - 16 */ + 20, 20, 20, 20, /* 17 - 20 */ + 24, 24, 24, 24, /* 21 - 24 */ + 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ + 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ + 48, 48, 48, 48, 48, 48, 48, 48}; /* 41 - 48 */ + + if (datalen > 48) + return 64; + + return plen[datalen]; +} + +/* check for length optimization and return 1/true when the check fails */ +static int check_optimized(struct canfd_frame *cf, int start_index) +{ + /* + * for CAN_DL <= 8 the start_index is equal to the CAN_DL as the + * padding would start at this point. E.g. if the padding would + * start at cf.data[7] cf->len has to be 7 to be optimal. + * Note: The data[] index starts with zero. + */ + if (cf->len <= CAN_MAX_DLEN) + return (cf->len != start_index); + + /* + * This relation is also valid in the non-linear DLC range, where + * we need to take care of the minimal next possible CAN_DL. + * The correct check would be (padlen(cf->len) != padlen(start_index)). + * But as cf->len can only take discrete values from 12, .., 64 at this + * point the padlen(cf->len) is always equal to cf->len. 
+ */ + return (cf->len != padlen(start_index)); +} + +/* check padding and return 1/true when the check fails */ +static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, + int start_index, __u8 content) +{ + int i; + + /* no RX_PADDING value => check length of optimized frame length */ + if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { + + if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) + return check_optimized(cf, start_index); + + /* no valid test against empty value => ignore frame */ + return 1; + } + + /* check datalength of correctly padded CAN frame */ + if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && + cf->len != padlen(cf->len)) + return 1; + + /* check padding content */ + if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { + for (i = start_index; i < cf->len; i++) + if (cf->data[i] != content) + return 1; + } + return 0; +} + +static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) +{ + if (so->tx.state != ISOTP_WAIT_FC && + so->tx.state != ISOTP_WAIT_FIRST_FC) + return 0; + + hrtimer_cancel(&so->txtimer); + + if ((cf->len < ae + FC_CONTENT_SZ) || + ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + return 1; + } + + /* get communication parameters only from the first FC frame */ + if (so->tx.state == ISOTP_WAIT_FIRST_FC) { + + so->txfc.bs = cf->data[ae + 1]; + so->txfc.stmin = cf->data[ae + 2]; + + /* fix wrong STmin values according spec */ + if ((so->txfc.stmin > 0x7F) && + ((so->txfc.stmin < 0xF1) || (so->txfc.stmin > 0xF9))) + so->txfc.stmin = 0x7F; + + so->tx_gap = ktime_set(0,0); + /* add transmission time for CAN frame N_As */ + so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + /* add waiting time for consecutive frames N_Cs */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->force_tx_stmin); + else if (so->txfc.stmin < 0x80) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->txfc.stmin * 1000000); + else + so->tx_gap = ktime_add_ns(so->tx_gap, + (so->txfc.stmin - 0xF0) + * 100000); + so->tx.state = ISOTP_WAIT_FC; + } + + DBG("FC frame: FS %d, BS %d, STmin 0x%02X, tx_gap %lld\n", + cf->data[ae] & 0x0F & 0x0F, so->txfc.bs, so->txfc.stmin, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + (long long)so->tx_gap); +#else + (long long)so->tx_gap.tv64); +#endif + + switch (cf->data[ae] & 0x0F) { + + case ISOTP_FC_CTS: + so->tx.bs = 0; + so->tx.state = ISOTP_SENDING; + DBG("starting txtimer for sending\n"); + /* start cyclic timer for sending CF frame */ + hrtimer_start(&so->txtimer, so->tx_gap, + HRTIMER_MODE_REL); + break; + + case ISOTP_FC_WT: + DBG("starting waiting for next FC\n"); + /* start timer to wait for next FC frame */ + hrtimer_start(&so->txtimer, ktime_set(1,0), + HRTIMER_MODE_REL); + break; + + case ISOTP_FC_OVFLW: + DBG("overflow in receiver side\n"); + + default: + /* stop this tx job. TODO: error reporting? 
*/ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + } + return 0; +} + +static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, + struct sk_buff *skb, int len) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + if (!len || len > cf->len - pcilen) + return 1; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) + return 1; + + nskb = alloc_skb(len, gfp_any()); + if (!nskb) + return 1; + + memcpy(skb_put(nskb, len), &cf->data[pcilen], len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; +} + +static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) +{ + struct isotp_sock *so = isotp_sk(sk); + int i; + int off; + int ff_pci_sz; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + /* get the used sender LL_DL from the (first) CAN frame data length */ + so->rx.ll_dl = padlen(cf->len); + + /* the first frame has to use the entire frame up to LL_DL length */ + if (cf->len != so->rx.ll_dl) + return 1; + + /* get the FF_DL */ + so->rx.len = (cf->data[ae] & 0x0F) << 8; + so->rx.len += cf->data[ae + 1]; + + /* Check for FF_DL escape sequence supporting 32 bit PDU length */ + if (so->rx.len) + ff_pci_sz = FF_PCI_SZ12; + else { + /* FF_DL = 0 => get real length from next 4 bytes */ + so->rx.len = cf->data[ae + 2] << 24; + so->rx.len += cf->data[ae + 3] << 16; + so->rx.len += cf->data[ae + 4] << 8; + so->rx.len += cf->data[ae + 5]; + ff_pci_sz = FF_PCI_SZ32; + } + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->rx.ll_dl > CAN_MAX_DLEN)? 1:0; + + if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) + return 1; + + if (so->rx.len > MAX_MSG_LENGTH) { + /* send FC frame with overflow status */ + isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); + return 1; + } + + /* copy the first received data bytes */ + so->rx.idx = 0; + for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) + so->rx.buf[so->rx.idx++] = cf->data[i]; + + /* initial setup for this pdu receiption */ + so->rx.sn = 1; + so->rx.state = ISOTP_WAIT_DATA; + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* send our first FC frame */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, + struct sk_buff *skb) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + int i; + + if (so->rx.state != ISOTP_WAIT_DATA) + return 0; + + /* drop if timestamp gap is less than force_rx_stmin nano secs */ + if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { + + if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < + so->force_rx_stmin) + return 0; + + so->lastrxcf_tstamp = skb->tstamp; + } + + hrtimer_cancel(&so->rxtimer); + + /* CFs are never longer than the FF */ + if (cf->len > so->rx.ll_dl) + return 1; + + /* CFs have usually the LL_DL length */ + if (cf->len < so->rx.ll_dl) { + /* this is only allowed for the last CF */ + if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) + return 1; + } + + if ((cf->data[ae] & 0x0F) != so->rx.sn) { + DBG("wrong sn %d. expected %d.\n", + cf->data[ae] & 0x0F, so->rx.sn); + /* some error reporting? 
*/ + so->rx.state = ISOTP_IDLE; + return 1; + } + so->rx.sn++; + so->rx.sn %= 16; + + for (i = ae + N_PCI_SZ; i < cf->len; i++) { + so->rx.buf[so->rx.idx++] = cf->data[i]; + if (so->rx.idx >= so->rx.len) + break; + } + + if (so->rx.idx >= so->rx.len) { + + /* we are done */ + so->rx.state = ISOTP_IDLE; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, i+1, so->opt.rxpad_content)) + return 1; + + nskb = alloc_skb(so->rx.len, gfp_any()); + if (!nskb) + return 1; + + memcpy(skb_put(nskb, so->rx.len), so->rx.buf, + so->rx.len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; + } + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* perform blocksize handling, if enabled */ + if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { + + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1,0), + HRTIMER_MODE_REL); + return 0; + } + + /* we reached the specified blocksize so->rxfc.bs */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static void isotp_rcv(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct isotp_sock *so = isotp_sk(sk); + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR)? 1:0; + u8 n_pci_type, sf_dl; + + BUG_ON(skb->len != CAN_MTU && skb->len != CANFD_MTU); + + /* + * Strictly receive only frames with the configured MTU size + * => clear separation of CAN2.0 / CAN FD transport channels + */ + if (skb->len != so->ll.mtu) + return; + + cf = (struct canfd_frame *) skb->data; + + /* if enabled: check receiption of my configured extended address */ + if (ae && cf->data[0] != so->opt.rx_ext_address) + return; + + n_pci_type = cf->data[ae] & 0xF0; + + if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { + /* check rx/tx path half duplex expectations */ + if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || + (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) + return; + } + + switch (n_pci_type) { + case N_PCI_FC: + /* tx path: flow control frame containing the FC parameters */ + isotp_rcv_fc(so, cf, ae); + break; + + case N_PCI_SF: + /* + * rx path: single frame + * + * As we do not have a rx.ll_dl configuration, we can only test + * if the CAN frames payload length matches the LL_DL == 8 + * requirements - no matter if it's CAN 2.0 or CAN FD + */ + + /* get the SF_DL from the N_PCI byte */ + sf_dl = cf->data[ae] & 0x0F; + + if (cf->len <= CAN_MAX_DLEN) + isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); + else if (skb->len == CANFD_MTU) { + /* + * We have a CAN FD frame and CAN_DL is greater than 8: + * Only frames with the SF_DL == 0 ESC value are valid. + * + * If so take care of the increased SF PCI size + * (SF_PCI_SZ8) to point to the message content behind + * the extended SF PCI info and get the real SF_DL + * length value from the formerly first data byte. 
+ */ + if (sf_dl == 0) + isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, + cf->data[SF_PCI_SZ4 + ae]); + } + break; + + case N_PCI_FF: + /* rx path: first frame */ + isotp_rcv_ff(sk, cf, ae); + break; + + case N_PCI_CF: + /* rx path: consecutive frame */ + isotp_rcv_cf(sk, cf, ae, skb); + break; + + } +} + +static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae, int off) +{ + int pcilen = N_PCI_SZ + ae + off; + int space = so->tx.ll_dl - pcilen; + int num = min_t(int, so->tx.len - so->tx.idx, space); + int i; + + cf->can_id = so->txid; + cf->len = num + pcilen; + + if (num < space) { + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + /* user requested padding */ + cf->len = padlen(cf->len); + memset(cf->data, so->opt.txpad_content, cf->len); + } else if (cf->len > CAN_MAX_DLEN) { + /* mandatory padding for CAN FD frames */ + cf->len = padlen(cf->len); + memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, + cf->len); + } + } + + for (i = 0; i < num; i++) + cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; + + if (ae) + cf->data[0] = so->opt.ext_address; +} + +static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae) +{ + int i; + int ff_pci_sz; + + cf->can_id = so->txid; + cf->len = so->tx.ll_dl; + if (ae) + cf->data[0] = so->opt.ext_address; + + /* create N_PCI bytes with 12/32 bit FF_DL data length */ + if (so->tx.len > 4095) { + /* use 32 bit FF_DL notation */ + cf->data[ae] = N_PCI_FF; + cf->data[ae + 1] = 0; + cf->data[ae + 2] = (u8) (so->tx.len >> 24) & 0xFFU; + cf->data[ae + 3] = (u8) (so->tx.len >> 16) & 0xFFU; + cf->data[ae + 4] = (u8) (so->tx.len >> 8) & 0xFFU; + cf->data[ae + 5] = (u8) so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ32; + } else { + /* use 12 bit FF_DL notation */ + cf->data[ae] = (u8) (so->tx.len>>8) | N_PCI_FF; + cf->data[ae + 1] = (u8) so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ12; + } + + /* add first data bytes depending on ae */ + for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) + cf->data[i] = so->tx.buf[so->tx.idx++]; + + so->tx.sn = 1; + so->tx.state = ISOTP_WAIT_FIRST_FC; +} + +static void isotp_tx_timer_tsklet(unsigned long data) +{ + struct isotp_sock *so = (struct isotp_sock *)data; + struct sock *sk = &so->sk; + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR)? 
1:0; + + switch (so->tx.state) { + + case ISOTP_WAIT_FC: + case ISOTP_WAIT_FIRST_FC: + + /* we did not get any flow control frame in time */ + + DBG("we did not get FC frame in time.\n"); + + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + break; + + case ISOTP_SENDING: + + /* push out the next segmented pdu */ + + DBG("next pdu to send.\n"); + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + break; + +isotp_tx_burst: + skb = alloc_skb(so->ll.mtu + CAN_SKBRES, gfp_any()); + if (!skb) { + dev_put(dev); + break; + } + + isotp_skb_reserve(skb, dev); + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* create consecutive frame */ + isotp_fill_dataframe(cf, so, ae, 0); + + /* place consecutive frame N_PCI in appropriate index */ + cf->data[ae] = N_PCI_CF | so->tx.sn++; + so->tx.sn %= 16; + so->tx.bs++; + + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + isotp_skb_set_owner(skb, sk); + can_send(skb, 1); + + if (so->tx.idx >= so->tx.len) { + /* we are done */ + DBG("we are done\n"); + so->tx.state = ISOTP_IDLE; + dev_put(dev); + wake_up_interruptible(&so->wait); + break; + } + + if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { + /* stop and wait for FC */ + DBG("BS stop and wait for FC\n"); + so->tx.state = ISOTP_WAIT_FC; + dev_put(dev); + hrtimer_start(&so->txtimer, + ktime_add(ktime_get(), ktime_set(1,0)), + HRTIMER_MODE_ABS); + break; + } + + /* no gap between data frames needed => use burst mode */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + if (!so->tx_gap) + goto isotp_tx_burst; +#else + if (!so->tx_gap.tv64) + goto isotp_tx_burst; +#endif + + /* start timer to send next data frame with correct delay */ + dev_put(dev); + hrtimer_start(&so->txtimer, + ktime_add(ktime_get(), so->tx_gap), + HRTIMER_MODE_ABS); + break; + + default: + BUG_ON(1); + } +} + +static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txtimer); + tasklet_schedule(&so->txtsklet); + + return HRTIMER_NORESTART; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) +static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +#else +static int isotp_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +#endif +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR)? 
1:0; + int off; + int err; + + if (!so->bound) + return -EADDRNOTAVAIL; + + /* we do not support multiple buffers - for now */ + if (so->tx.state != ISOTP_IDLE) { + if (msg->msg_flags & MSG_DONTWAIT) + return -EAGAIN; + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + } + + if (!size || size > MAX_MSG_LENGTH) + return -EINVAL; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) + err = memcpy_from_msg(so->tx.buf, msg, size); +#else + err = memcpy_fromiovec(so->tx.buf, msg->msg_iov, size); +#endif + if (err < 0) + return err; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + return -ENXIO; + + skb = sock_alloc_send_skb(sk, so->ll.mtu + CAN_SKBRES, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) { + dev_put(dev); + return err; + } + + isotp_skb_reserve(skb, dev); + + so->tx.state = ISOTP_SENDING; + so->tx.len = size; + so->tx.idx = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->tx.ll_dl > CAN_MAX_DLEN)? 1:0; + + /* check for single frame transmission depending on TX_DL */ + if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { + + /* + * The message size generally fits into a SingleFrame - good. + * + * SF_DL ESC offset optimization: + * + * When TX_DL is greater 8 but the message would still fit + * into a 8 byte CAN frame, we can omit the offset. + * This prevents a protocol caused length extension from + * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. + */ + if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) + off = 0; + + isotp_fill_dataframe(cf, so, ae, off); + + /* place single frame N_PCI w/o length in appropriate index */ + cf->data[ae] = N_PCI_SF; + + /* place SF_DL size value depending on the SF_DL ESC offset */ + if (off) + cf->data[SF_PCI_SZ4 + ae] = size; + else + cf->data[ae] |= size; + + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + } else { + /* send first frame and wait for FC */ + + isotp_create_fframe(cf, so, ae); + + DBG("starting txtimer for fc\n"); + /* start timeout for FC */ + hrtimer_start(&so->txtimer, ktime_set(1,0), HRTIMER_MODE_REL); + } + + /* send the first or only CAN frame */ + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + skb->sk = sk; + err = can_send(skb, 1); + dev_put(dev); + if (err) + return err; + + return size; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0) +static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +#else +static int isotp_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +#endif +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int err = 0; + int noblock; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + return err; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) + err = memcpy_to_msg(msg, skb->data, size); +#else + err = memcpy_toiovec(msg->msg_iov, skb->data, size); +#endif + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static int isotp_release(struct socket *sock) +{ + struct sock 
*sk = sock->sk; + struct isotp_sock *so; + struct net *net; + + if (!sk) + return 0; + + so = isotp_sk(sk); + net = sock_net(sk); + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + unregister_netdevice_notifier(&so->notifier); + + lock_sock(sk); + + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + tasklet_kill(&so->txtsklet); + + /* remove current filters & unregister */ + if (so->bound) { + if (so->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, so->ifindex); + if (dev) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + can_rx_unregister(net, dev, so->rxid, +#else + can_rx_unregister(dev, so->rxid, +#endif + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + so->ifindex = 0; + so->bound = 0; + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct net *net = sock_net(sk); + int ifindex; + struct net_device *dev; + int err = 0; + int notify_enetdown = 0; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,4,0) + if (len < CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)) +#else + if (len < sizeof(*addr)) +#endif + return -EINVAL; + + if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) + return -EADDRNOTAVAIL; + + if ((addr->can_addr.tp.rx_id | addr->can_addr.tp.tx_id) & + (CAN_ERR_FLAG | CAN_RTR_FLAG)) + return -EADDRNOTAVAIL; + + if (!addr->can_ifindex) + return -ENODEV; + + lock_sock(sk); + + if (so->bound && addr->can_ifindex == so->ifindex && + addr->can_addr.tp.rx_id == so->rxid && + addr->can_addr.tp.tx_id == so->txid) + goto out; + + dev = dev_get_by_index(net, addr->can_ifindex); + if (!dev) { + err = -ENODEV; + goto out; + } + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + err = -ENODEV; + goto out; + } + if (dev->mtu < so->ll.mtu) { + dev_put(dev); + err = -EINVAL; + goto out; + } + if (!(dev->flags & IFF_UP)) + notify_enetdown = 1; + + ifindex = dev->ifindex; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + can_rx_register(net, dev, addr->can_addr.tp.rx_id, +#else + can_rx_register(dev, addr->can_addr.tp.rx_id, +#endif + SINGLE_MASK(addr->can_addr.tp.rx_id), isotp_rcv, sk, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,11)) || \ + ((LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,50)) && (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))) || \ + ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,49)) && (LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0))) || \ + ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,42)) && (LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0))) + "isotp", sk); +#else + "isotp"); +#endif + dev_put(dev); + + if (so->bound) { + /* unregister old filter */ + if (so->ifindex) { + dev = dev_get_by_index(net, so->ifindex); + if (dev) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + can_rx_unregister(net, dev, so->rxid, +#else + can_rx_unregister(dev, so->rxid, +#endif + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + /* switch to new settings */ + so->ifindex = ifindex; + so->rxid = addr->can_addr.tp.rx_id; + so->txid = addr->can_addr.tp.tx_id; + so->bound = 1; + +out: + release_sock(sk); + + if (notify_enetdown) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + + return err; +} + +#if LINUX_VERSION_CODE >= 
KERNEL_VERSION(4,17,0) +static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) +#else +static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, + int *len, int peer) +#endif +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + + if (peer) + return -EOPNOTSUPP; + + addr->can_family = AF_CAN; + addr->can_ifindex = so->ifindex; + addr->can_addr.tp.rx_id = so->rxid; + addr->can_addr.tp.tx_id = so->txid; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0) + return sizeof(*addr); +#else + *len = sizeof(*addr); + + return 0; +#endif +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) +static int isotp_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +#else +static int isotp_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +#endif +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int ret = 0; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + if (optlen < 0) + return -EINVAL; + + switch (optname) { + + case CAN_ISOTP_OPTS: + if (optlen != sizeof(struct can_isotp_options)) + return -EINVAL; + + if (copy_from_user(&so->opt, optval, optlen)) + return -EFAULT; + + /* no separate rx_ext_address is given => use ext_address */ + if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) + so->opt.rx_ext_address = so->opt.ext_address; + break; + + case CAN_ISOTP_RECV_FC: + if (optlen != sizeof(struct can_isotp_fc_options)) + return -EINVAL; + + if (copy_from_user(&so->rxfc, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_TX_STMIN: + if (optlen != sizeof(__u32)) + return -EINVAL; + + if (copy_from_user(&so->force_tx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_RX_STMIN: + if (optlen != sizeof(__u32)) + return -EINVAL; + + if (copy_from_user(&so->force_rx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_LL_OPTS: + if (optlen != sizeof(struct can_isotp_ll_options)) + return -EINVAL; + else { + struct can_isotp_ll_options ll; + + if (copy_from_user(&ll, optval, optlen)) + return -EFAULT; + + /* check for correct ISO 11898-1 DLC data lentgh */ + if (ll.tx_dl != padlen(ll.tx_dl)) + return -EINVAL; + + if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) + return -EINVAL; + + if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN) + return -EINVAL; + + memcpy(&so->ll, &ll, sizeof(ll)); + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = ll.tx_dl; + } + break; + + default: + ret = -ENOPROTOOPT; + } + + return ret; +} + +static int isotp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int len; + void *val; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + + case CAN_ISOTP_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_options)); + val = &so->opt; + break; + + case CAN_ISOTP_RECV_FC: + len = min_t(int, len, sizeof(struct can_isotp_fc_options)); + val = &so->rxfc; + break; + + case CAN_ISOTP_TX_STMIN: + len = min_t(int, len, sizeof(__u32)); + val = &so->force_tx_stmin; + break; + + case CAN_ISOTP_RX_STMIN: + len = min_t(int, len, sizeof(__u32)); + val = &so->force_rx_stmin; + break; + + case CAN_ISOTP_LL_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_ll_options)); + val = &so->ll; 
+ break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, val, len)) + return -EFAULT; + return 0; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) +static int isotp_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); +#else +static int isotp_notifier(struct notifier_block *nb, + unsigned long msg, void *data) +{ + struct net_device *dev = (struct net_device *)data; +#endif + struct isotp_sock *so = container_of(nb, struct isotp_sock, notifier); + struct sock *sk = &so->sk; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + if (!net_eq(dev_net(dev), sock_net(sk))) + return NOTIFY_DONE; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) + if (dev_net(dev) != &init_net) + return NOTIFY_DONE; +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) + if (dev->nd_net != &init_net) + return NOTIFY_DONE; +#endif + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + if (so->ifindex != dev->ifindex) + return NOTIFY_DONE; + + switch (msg) { + + case NETDEV_UNREGISTER: + lock_sock(sk); + /* remove current filters & unregister */ + if (so->bound) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + can_rx_unregister(dev_net(dev), dev, so->rxid, +#else + can_rx_unregister(dev, so->rxid, +#endif + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + + so->ifindex = 0; + so->bound = 0; + release_sock(sk); + + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + + case NETDEV_DOWN: + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + } + + return NOTIFY_DONE; +} + +static int isotp_init(struct sock *sk) +{ + struct isotp_sock *so = isotp_sk(sk); + + so->ifindex = 0; + so->bound = 0; + + so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; + so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; + so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; + so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; + so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; + so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; + so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = so->ll.tx_dl; + + so->rx.state = ISOTP_IDLE; + so->tx.state = ISOTP_IDLE; + + hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + so->rxtimer.function = isotp_rx_timer_handler; + hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + so->txtimer.function = isotp_tx_timer_handler; + + tasklet_init(&so->txtsklet, isotp_tx_timer_tsklet, (unsigned long)so); + + init_waitqueue_head(&so->wait); + + so->notifier.notifier_call = isotp_notifier; + register_netdevice_notifier(&so->notifier); + + return 0; +} + +int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + +static const struct proto_ops isotp_ops = { + .family = PF_CAN, + .release = isotp_release, + .bind = isotp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = isotp_getname, + .poll = datagram_poll, + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,2,0) + .ioctl = 
isotp_sock_no_ioctlcmd, + .gettstamp = sock_gettstamp, +#else + .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ +#endif + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = isotp_setsockopt, + .getsockopt = isotp_getsockopt, + .sendmsg = isotp_sendmsg, + .recvmsg = isotp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto isotp_proto __read_mostly = { + .name = "CAN_ISOTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct isotp_sock), + .init = isotp_init, +}; + +static const struct can_proto isotp_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_ISOTP, +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) + .capability = -1, +#endif + .ops = &isotp_ops, + .prot = &isotp_proto, +}; + +static __init int isotp_module_init(void) +{ + int err; + + printk(banner); + + err = can_proto_register(&isotp_can_proto); + if (err < 0) + printk(KERN_ERR "can: registration of isotp protocol failed\n"); + + return err; +} + +static __exit void isotp_module_exit(void) +{ + can_proto_unregister(&isotp_can_proto); +} + +module_init(isotp_module_init); +module_exit(isotp_module_exit); From 0eeca08536b3c2f2e6d0877324612cbcab421fe0 Mon Sep 17 00:00:00 2001 From: Robert Nelson Date: Mon, 29 Jul 2019 11:58:09 -0500 Subject: [PATCH 002/133] CAN_ISOTP: wire up to kconfig/makefile Signed-off-by: Robert Nelson --- net/can/Kconfig | 10 ++++++++++ net/can/Makefile | 3 +++ 2 files changed, 13 insertions(+) diff --git a/net/can/Kconfig b/net/can/Kconfig index d770427524577..9e2f196f0423e 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -55,6 +55,16 @@ config CAN_GW source "net/can/j1939/Kconfig" +config CAN_ISOTP + tristate "ISO 15765-2:2016 CAN transport protocol" + default y + ---help--- + ISO 15765-2 CAN transport protocol for protocol family CAN + + This implementation is already widely used in automotive use-cases, e.g. + for UDS based OBD diagnosis. Although some small adaptions may be applied + to make it ready for Linux Mainline. Feedback is welcome. 
+ source "drivers/net/can/Kconfig" endif diff --git a/net/can/Makefile b/net/can/Makefile index 08bd217fc0510..cfa1024369cfb 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -16,4 +16,7 @@ can-bcm-y := bcm.o obj-$(CONFIG_CAN_GW) += can-gw.o can-gw-y := gw.o +obj-$(CONFIG_CAN_ISOTP) += can-isotp.o +can-isotp-y := isotp.o + obj-$(CONFIG_CAN_J1939) += j1939/ From 6c67453e56157eddad2ec2dc10487b5171ce9d34 Mon Sep 17 00:00:00 2001 From: Robert Nelson Date: Thu, 23 Jul 2020 14:54:35 -0500 Subject: [PATCH 003/133] merge: CONFIG_PREEMPT_RT Patch Set patch-5.4.52-rt31.patch.xz Signed-off-by: Robert Nelson --- .../Expedited-Grace-Periods.html | 8 +- .../RCU/Design/Requirements/Requirements.html | 24 +- Documentation/RCU/checklist.txt | 4 +- Documentation/RCU/rcubarrier.txt | 8 +- Documentation/RCU/stallwarn.txt | 4 +- Documentation/RCU/whatisRCU.txt | 7 +- Documentation/admin-guide/sysctl/vm.rst | 3 + Documentation/printk-ringbuffer.txt | 377 ++++ Documentation/trace/ftrace-uses.rst | 2 +- arch/Kconfig | 1 + arch/alpha/include/asm/spinlock_types.h | 4 - arch/arc/kernel/entry.S | 6 +- arch/arm/Kconfig | 4 +- arch/arm/include/asm/irq.h | 2 + arch/arm/include/asm/spinlock_types.h | 4 - arch/arm/include/asm/switch_to.h | 10 +- arch/arm/include/asm/thread_info.h | 8 +- arch/arm/kernel/asm-offsets.c | 1 + arch/arm/kernel/entry-armv.S | 23 +- arch/arm/kernel/entry-common.S | 9 +- arch/arm/kernel/signal.c | 3 +- arch/arm/kernel/smp.c | 2 - arch/arm/kernel/traps.c | 2 + arch/arm/mm/cache-v7.S | 4 +- arch/arm/mm/cache-v7m.S | 4 +- arch/arm/mm/fault.c | 6 + arch/arm/mm/highmem.c | 58 +- arch/arm64/Kconfig | 54 +- arch/arm64/crypto/sha256-glue.c | 2 +- arch/arm64/include/asm/assembler.h | 6 +- arch/arm64/include/asm/kvm_mmu.h | 1 + arch/arm64/include/asm/preempt.h | 29 +- arch/arm64/include/asm/spinlock_types.h | 4 - arch/arm64/include/asm/thread_info.h | 6 +- arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 15 +- arch/arm64/kernel/fpsimd.c | 14 +- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kernel/smp.c | 4 + arch/arm64/kernel/traps.c | 3 + arch/arm64/kvm/va_layout.c | 8 +- arch/c6x/kernel/entry.S | 8 +- arch/csky/kernel/entry.S | 4 +- arch/h8300/kernel/entry.S | 6 +- arch/hexagon/include/asm/spinlock_types.h | 4 - arch/hexagon/kernel/vm_entry.S | 6 +- arch/ia64/include/asm/spinlock_types.h | 4 - arch/ia64/kernel/entry.S | 12 +- arch/ia64/kernel/kprobes.c | 2 +- arch/m68k/coldfire/entry.S | 2 +- arch/microblaze/kernel/entry.S | 2 +- arch/mips/Kconfig | 2 +- arch/mips/include/asm/asmmacro.h | 4 +- arch/mips/kernel/entry.S | 6 +- arch/nds32/Kconfig | 2 +- arch/nds32/kernel/ex-exit.S | 4 +- arch/nios2/kernel/entry.S | 2 +- arch/parisc/Kconfig | 2 +- arch/parisc/kernel/entry.S | 10 +- arch/powerpc/Kconfig | 6 +- arch/powerpc/include/asm/spinlock_types.h | 4 - arch/powerpc/include/asm/stackprotector.h | 4 + arch/powerpc/include/asm/thread_info.h | 16 +- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_32.S | 27 +- arch/powerpc/kernel/entry_64.S | 28 +- arch/powerpc/kernel/irq.c | 2 + arch/powerpc/kernel/misc_32.S | 2 + arch/powerpc/kernel/misc_64.S | 2 + arch/powerpc/kernel/traps.c | 8 +- arch/powerpc/kernel/watchdog.c | 5 - arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/platforms/ps3/device-init.c | 4 +- arch/powerpc/platforms/pseries/iommu.c | 17 +- arch/riscv/kernel/entry.S | 4 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/preempt.h | 4 +- arch/s390/include/asm/spinlock_types.h | 4 - arch/s390/kernel/dumpstack.c | 2 + arch/s390/kernel/entry.S | 2 +- arch/sh/Kconfig | 2 
+- arch/sh/include/asm/spinlock_types.h | 4 - arch/sh/kernel/cpu/sh5/entry.S | 4 +- arch/sh/kernel/entry-common.S | 4 +- arch/sh/kernel/irq.c | 2 + arch/sparc/Kconfig | 2 +- arch/sparc/kernel/irq_64.c | 2 + arch/sparc/kernel/rtrap_64.S | 2 +- arch/x86/Kconfig | 6 +- arch/x86/crypto/aesni-intel_glue.c | 22 +- arch/x86/crypto/cast5_avx_glue.c | 21 +- arch/x86/crypto/chacha_glue.c | 11 +- arch/x86/crypto/glue_helper.c | 26 +- arch/x86/entry/common.c | 11 +- arch/x86/entry/entry_32.S | 18 + arch/x86/entry/entry_64.S | 18 + arch/x86/include/asm/fpu/api.h | 1 + arch/x86/include/asm/preempt.h | 33 +- arch/x86/include/asm/signal.h | 13 + arch/x86/include/asm/stackprotector.h | 8 +- arch/x86/include/asm/thread_info.h | 11 + arch/x86/kernel/apic/io_apic.c | 16 +- arch/x86/kernel/asm-offsets.c | 5 + arch/x86/kernel/cpu/mshyperv.c | 3 +- arch/x86/kernel/fpu/core.c | 12 + arch/x86/kernel/irq_32.c | 2 + arch/x86/kernel/process_32.c | 32 + arch/x86/kvm/x86.c | 8 + arch/x86/mm/highmem_32.c | 13 +- arch/x86/mm/iomap_32.c | 11 +- arch/x86/mm/tlb.c | 2 +- arch/xtensa/include/asm/spinlock_types.h | 4 - arch/xtensa/kernel/entry.S | 2 +- arch/xtensa/kernel/traps.c | 7 +- block/blk-ioc.c | 3 +- block/blk-mq.c | 18 +- block/blk-softirq.c | 6 +- crypto/cryptd.c | 19 +- drivers/block/zram/zcomp.c | 13 +- drivers/block/zram/zcomp.h | 1 + drivers/block/zram/zram_drv.c | 41 +- drivers/block/zram/zram_drv.h | 1 + drivers/char/random.c | 11 +- drivers/char/tpm/tpm-dev-common.c | 1 - drivers/char/tpm/tpm_tis.c | 29 +- drivers/clocksource/Kconfig | 7 + drivers/clocksource/timer-atmel-tcb.c | 73 +- drivers/connector/cn_proc.c | 6 +- drivers/dma-buf/dma-buf.c | 8 +- drivers/dma-buf/dma-resv.c | 45 +- drivers/firmware/efi/efi.c | 5 +- drivers/gpu/drm/Kconfig | 2 +- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 +- drivers/gpu/drm/i915/display/intel_sprite.c | 13 +- drivers/gpu/drm/i915/gem/i915_gem_busy.c | 6 +- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 19 +- drivers/gpu/drm/i915/gt/intel_engine.h | 1 - drivers/gpu/drm/i915/gt/intel_engine_pm.c | 7 +- drivers/gpu/drm/i915/gt/intel_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 2 +- drivers/gpu/drm/i915/i915_irq.c | 2 + drivers/gpu/drm/i915/i915_request.c | 12 +- drivers/gpu/drm/i915/i915_trace.h | 6 +- drivers/gpu/drm/radeon/radeon_display.c | 2 + drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 - drivers/hv/hyperv_vmbus.h | 1 + drivers/hv/vmbus_drv.c | 5 +- drivers/leds/trigger/Kconfig | 1 + drivers/md/bcache/Kconfig | 1 + drivers/md/raid5.c | 7 +- drivers/md/raid5.h | 1 + drivers/media/platform/Kconfig | 2 +- .../wireless/intersil/orinoco/orinoco_usb.c | 4 +- drivers/of/base.c | 133 +- drivers/of/dynamic.c | 2 +- drivers/of/of_private.h | 4 +- drivers/of/overlay.c | 10 - drivers/pci/switch/switchtec.c | 22 +- drivers/scsi/fcoe/fcoe.c | 16 +- drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- drivers/scsi/libfc/fc_exch.c | 4 +- drivers/thermal/intel/x86_pkg_temp_thermal.c | 24 +- drivers/tty/serial/8250/8250.h | 47 +- drivers/tty/serial/8250/8250_core.c | 17 +- drivers/tty/serial/8250/8250_fsl.c | 9 + drivers/tty/serial/8250/8250_ingenic.c | 7 + drivers/tty/serial/8250/8250_mtk.c | 29 +- drivers/tty/serial/8250/8250_port.c | 92 +- drivers/tty/serial/amba-pl011.c | 17 +- drivers/tty/serial/omap-serial.c | 12 +- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 4 +- drivers/video/backlight/Kconfig | 4 +- drivers/xen/preempt.c | 4 +- fs/afs/dir_silly.c | 2 +- fs/btrfs/volumes.h | 2 +- fs/buffer.c | 21 +- fs/cifs/readdir.c | 2 +- fs/dcache.c | 50 +- 
fs/eventpoll.c | 4 +- fs/ext4/page-io.c | 8 +- fs/fscache/cookie.c | 8 + fs/fscache/main.c | 1 + fs/fuse/readdir.c | 2 +- fs/inode.c | 2 +- fs/jbd2/commit.c | 13 +- fs/jbd2/journal.c | 30 +- fs/jbd2/transaction.c | 144 +- fs/namei.c | 4 +- fs/namespace.c | 8 +- fs/nfs/delegation.c | 4 +- fs/nfs/dir.c | 12 +- fs/nfs/inode.c | 4 + fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 4 +- fs/nfs/nfs4state.c | 22 +- fs/nfs/unlink.c | 35 +- fs/ntfs/aops.c | 9 +- fs/ocfs2/suballoc.c | 19 +- fs/proc/base.c | 3 +- fs/proc/kmsg.c | 4 +- fs/proc/proc_sysctl.c | 2 +- fs/squashfs/decompressor_multi_percpu.c | 16 +- fs/stack.c | 6 +- fs/userfaultfd.c | 12 +- include/linux/bottom_half.h | 5 + include/linux/bpf.h | 38 +- include/linux/buffer_head.h | 6 +- include/linux/cgroup-defs.h | 5 - include/linux/cgroup.h | 3 - include/linux/completion.h | 8 +- include/linux/console.h | 6 + include/linux/dcache.h | 4 +- include/linux/delay.h | 6 + include/linux/dma-resv.h | 4 +- include/linux/filter.h | 33 +- include/linux/fs.h | 6 +- include/linux/fscache.h | 1 + include/linux/genhd.h | 6 +- include/linux/hardirq.h | 2 - include/linux/highmem.h | 32 +- include/linux/idr.h | 5 +- include/linux/interrupt.h | 8 +- include/linux/irq_work.h | 8 + include/linux/irqdesc.h | 1 + include/linux/irqflags.h | 23 +- include/linux/jbd2.h | 27 +- include/linux/journal-head.h | 21 +- include/linux/kernel.h | 12 + include/linux/kmsg_dump.h | 6 +- include/linux/list_bl.h | 30 +- include/linux/locallock.h | 282 +++ include/linux/mm_types.h | 4 + include/linux/mutex.h | 20 +- include/linux/mutex_rt.h | 130 ++ include/linux/netdevice.h | 1 + include/linux/nfs_fs.h | 4 + include/linux/nfs_xdr.h | 2 +- include/linux/percpu-refcount.h | 16 +- include/linux/percpu-rwsem.h | 83 +- include/linux/percpu.h | 29 + include/linux/pid.h | 1 + include/linux/posix-timers.h | 11 + include/linux/preempt.h | 109 +- include/linux/printk.h | 25 +- include/linux/printk_ringbuffer.h | 114 + include/linux/radix-tree.h | 6 +- include/linux/random.h | 2 +- include/linux/ratelimit.h | 2 +- include/linux/rbtree.h | 2 +- include/linux/rcu_assign_pointer.h | 62 + include/linux/rcupdate.h | 70 +- include/linux/rtmutex.h | 22 +- include/linux/rwlock_rt.h | 119 + include/linux/rwlock_types.h | 4 + include/linux/rwlock_types_rt.h | 55 + include/linux/rwsem-rt.h | 68 + include/linux/rwsem.h | 18 +- include/linux/sched.h | 149 +- include/linux/sched/mm.h | 10 + include/linux/sched/wake_q.h | 13 +- include/linux/seqlock.h | 66 +- include/linux/serial_8250.h | 5 + include/linux/signal.h | 1 + include/linux/skbuff.h | 7 + include/linux/smp.h | 14 +- include/linux/spinlock.h | 12 +- include/linux/spinlock_api_smp.h | 4 +- include/linux/spinlock_rt.h | 156 ++ include/linux/spinlock_types.h | 76 +- include/linux/spinlock_types_nort.h | 33 + include/linux/spinlock_types_raw.h | 55 + include/linux/spinlock_types_rt.h | 48 + include/linux/spinlock_types_up.h | 4 - include/linux/stop_machine.h | 2 + include/linux/swait.h | 16 + include/linux/swap.h | 2 + include/linux/thread_info.h | 12 +- include/linux/trace_events.h | 2 + include/linux/uaccess.h | 2 + include/linux/vmstat.h | 4 + include/linux/wait.h | 2 + include/net/gen_stats.h | 11 +- include/net/neighbour.h | 6 +- include/net/net_seq_lock.h | 15 + include/net/sch_generic.h | 19 +- include/xen/xen-ops.h | 4 +- init/Kconfig | 5 +- init/init_task.c | 4 + kernel/Kconfig.locks | 12 +- kernel/Kconfig.preempt | 6 + kernel/bpf/hashtab.c | 152 +- kernel/bpf/lpm_trie.c | 12 +- kernel/bpf/percpu_freelist.c | 20 +- kernel/bpf/stackmap.c | 17 
+- kernel/bpf/syscall.c | 21 +- kernel/bpf/verifier.c | 40 +- kernel/cgroup/cgroup.c | 12 - kernel/cgroup/cpuset.c | 70 +- kernel/cgroup/rstat.c | 55 +- kernel/cpu.c | 48 +- kernel/events/core.c | 4 +- kernel/exit.c | 3 +- kernel/fork.c | 27 +- kernel/futex.c | 108 +- kernel/irq/handle.c | 8 +- kernel/irq/manage.c | 8 +- kernel/irq/spurious.c | 8 + kernel/irq_work.c | 59 +- kernel/kexec_core.c | 1 - kernel/ksysfs.c | 12 + kernel/locking/Makefile | 10 +- kernel/locking/lockdep.c | 2 + kernel/locking/locktorture.c | 1 - kernel/locking/mutex-rt.c | 223 ++ kernel/locking/percpu-rwsem.c | 192 +- kernel/locking/rtmutex.c | 944 +++++++- kernel/locking/rtmutex_common.h | 31 +- kernel/locking/rwlock-rt.c | 384 ++++ kernel/locking/rwsem-rt.c | 302 +++ kernel/locking/rwsem.c | 14 +- kernel/locking/rwsem.h | 10 - kernel/locking/spinlock.c | 7 + kernel/locking/spinlock_debug.c | 5 + kernel/panic.c | 5 +- kernel/printk/Makefile | 1 - kernel/printk/internal.h | 72 - kernel/printk/printk.c | 1915 ++++++++--------- kernel/printk/printk_safe.c | 406 ---- kernel/ptrace.c | 9 +- kernel/rcu/Kconfig | 8 +- kernel/rcu/rcutorture.c | 98 +- kernel/rcu/srcutiny.c | 2 +- kernel/rcu/srcutree.c | 13 +- kernel/rcu/tree.c | 9 +- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 10 +- kernel/rcu/update.c | 4 +- kernel/sched/completion.c | 34 +- kernel/sched/core.c | 406 +++- kernel/sched/debug.c | 4 + kernel/sched/fair.c | 16 +- kernel/sched/features.h | 8 + kernel/sched/sched.h | 14 + kernel/sched/swait.c | 17 +- kernel/sched/topology.c | 1 + kernel/seccomp.c | 4 +- kernel/signal.c | 105 +- kernel/smp.c | 99 +- kernel/softirq.c | 231 +- kernel/stop_machine.c | 7 +- kernel/sysctl.c | 29 +- kernel/time/hrtimer.c | 34 +- kernel/time/jiffies.c | 7 +- kernel/time/posix-cpu-timers.c | 203 +- kernel/time/tick-common.c | 10 +- kernel/time/tick-sched.c | 29 +- kernel/time/timekeeping.c | 6 +- kernel/time/timekeeping.h | 3 +- kernel/time/timer.c | 2 + kernel/trace/bpf_trace.c | 7 +- kernel/trace/trace.c | 40 +- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 2 + kernel/trace/trace_output.c | 19 +- kernel/trace/trace_uprobe.c | 11 +- kernel/up.c | 12 +- kernel/workqueue.c | 181 +- lib/Kconfig.debug | 21 +- lib/Makefile | 2 +- lib/bust_spinlocks.c | 3 +- lib/debugobjects.c | 5 +- lib/irq_poll.c | 5 + lib/locking-selftest.c | 50 + lib/nmi_backtrace.c | 6 - lib/printk_ringbuffer.c | 589 +++++ lib/radix-tree.c | 28 +- lib/scatterlist.c | 2 +- lib/smp_processor_id.c | 7 +- lib/test_bpf.c | 4 +- mm/Kconfig | 2 +- mm/compaction.c | 10 +- mm/highmem.c | 6 +- mm/kmemleak.c | 112 +- mm/memcontrol.c | 28 +- mm/memory.c | 2 +- mm/page_alloc.c | 188 +- mm/slab.c | 90 +- mm/slab.h | 2 +- mm/slub.c | 160 +- mm/swap.c | 74 +- mm/vmalloc.c | 50 +- mm/vmstat.c | 12 + mm/workingset.c | 5 +- mm/zsmalloc.c | 80 +- mm/zswap.c | 23 +- net/Kconfig | 2 +- net/bpf/test_run.c | 8 +- net/core/dev.c | 58 +- net/core/flow_dissector.c | 4 +- net/core/gen_estimator.c | 6 +- net/core/gen_stats.c | 12 +- net/core/skmsg.c | 8 +- net/kcm/kcmsock.c | 4 +- net/packet/af_packet.c | 5 +- net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 15 +- net/sunrpc/svc_xprt.c | 4 +- security/apparmor/include/path.h | 19 +- security/apparmor/lsm.c | 2 +- virt/kvm/arm/arch_timer.c | 8 +- virt/kvm/arm/arm.c | 6 +- 412 files changed, 9817 insertions(+), 3774 deletions(-) create mode 100644 Documentation/printk-ringbuffer.txt create mode 100644 include/linux/locallock.h create mode 100644 include/linux/mutex_rt.h create mode 100644 include/linux/printk_ringbuffer.h 
create mode 100644 include/linux/rcu_assign_pointer.h create mode 100644 include/linux/rwlock_rt.h create mode 100644 include/linux/rwlock_types_rt.h create mode 100644 include/linux/rwsem-rt.h create mode 100644 include/linux/spinlock_rt.h create mode 100644 include/linux/spinlock_types_nort.h create mode 100644 include/linux/spinlock_types_raw.h create mode 100644 include/linux/spinlock_types_rt.h create mode 100644 include/net/net_seq_lock.h create mode 100644 kernel/locking/mutex-rt.c create mode 100644 kernel/locking/rwlock-rt.c create mode 100644 kernel/locking/rwsem-rt.c delete mode 100644 kernel/printk/internal.h delete mode 100644 kernel/printk/printk_safe.c create mode 100644 lib/printk_ringbuffer.c diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html index 57300db4b5ff6..31c99382994e0 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -56,8 +56,8 @@

RCU-preempt Expedited Grace Periods

-CONFIG_PREEMPT=y kernels implement RCU-preempt. -The overall flow of the handling of a given CPU by an RCU-preempt +CONFIG_PREEMPT=y and CONFIG_PREEMPT_RT=y kernels implement +RCU-preempt. The overall flow of the handling of a given CPU by an RCU-preempt expedited grace period is shown in the following diagram:

ExpRCUFlow.svg @@ -140,8 +140,8 @@

RCU-sched Expedited Grace Periods

-CONFIG_PREEMPT=n kernels implement RCU-sched. -The overall flow of the handling of a given CPU by an RCU-sched +CONFIG_PREEMPT=n and CONFIG_PREEMPT_RT=n kernels implement +RCU-sched. The overall flow of the handling of a given CPU by an RCU-sched expedited grace period is shown in the following diagram:

ExpSchedFlow.svg diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 467251f7fef69..348c5db1ff2bb 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -106,7 +106,7 @@

Grace-Period Guarantee

Production-quality implementations of rcu_read_lock() and rcu_read_unlock() are extremely lightweight, and in fact have exactly zero overhead in Linux kernels built for production -use with CONFIG_PREEMPT=n. +use with CONFIG_PREEMPTION=n.

This guarantee allows ordering to be enforced with extremely low @@ -1499,7 +1499,7 @@

Performance and Scalability

However, as I learned from Matt Mackall's bloatwatch efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (CONFIG_PREEMPT=n) kernels, and thus +non-preemptible (CONFIG_PREEMPTION=n) kernels, and thus tiny RCU was born. Josh Triplett has since taken over the small-memory banner with his @@ -1887,7 +1887,7 @@

Composability

Implementations of RCU for which rcu_read_lock() and rcu_read_unlock() generate no code, such as -Linux-kernel RCU when CONFIG_PREEMPT=n, can be +Linux-kernel RCU when CONFIG_PREEMPTION=n, can be nested arbitrarily deeply. After all, there is no overhead. Except that if all these instances of rcu_read_lock() @@ -2229,7 +2229,7 @@

Early Boot

However, once the scheduler has spawned its first kthread, this early boot trick fails for synchronize_rcu() (as well as for -synchronize_rcu_expedited()) in CONFIG_PREEMPT=y +synchronize_rcu_expedited()) in CONFIG_PREEMPTION=y kernels. The reason is that an RCU read-side critical section might be preempted, which means that a subsequent synchronize_rcu() really does have @@ -2568,7 +2568,7 @@

If the compiler did make this transformation in a -CONFIG_PREEMPT=n kernel build, and if get_user() did +CONFIG_PREEMPTION=n kernel build, and if get_user() did page fault, the result would be a quiescent state in the middle of an RCU read-side critical section. This misplaced quiescent state could result in line 4 being @@ -2906,7 +2906,7 @@

The real-time-latency response requirements are such that the traditional approach of disabling preemption across RCU read-side critical sections is inappropriate. -Kernels built with CONFIG_PREEMPT=y therefore +Kernels built with CONFIG_PREEMPTION=y therefore use an RCU implementation that allows RCU read-side critical sections to be preempted. This requirement made its presence known after users made it @@ -3064,7 +3064,7 @@

Bottom-Half Flavor (Historical)

rcu_barrier_bh(), and rcu_read_lock_bh_held(). However, the update-side APIs are now simple wrappers for other RCU -flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt +flavors, namely RCU-sched in CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise.

Sched Flavor (Historical)

@@ -3088,12 +3088,12 @@

Sched Flavor (Historical)

Therefore, RCU-sched was created, which follows “classic” RCU in that an RCU-sched grace period waits for pre-existing interrupt and NMI handlers. -In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched +In kernels built with CONFIG_PREEMPTION=n, the RCU and RCU-sched APIs have identical implementations, while kernels built with -CONFIG_PREEMPT=y provide a separate implementation for each. +CONFIG_PREEMPTION=y provide a separate implementation for each.

-Note well that in CONFIG_PREEMPT=y kernels, +Note well that in CONFIG_PREEMPTION=y kernels, rcu_read_lock_sched() and rcu_read_unlock_sched() disable and re-enable preemption, respectively. This means that if there was a preemption attempt during the @@ -3302,12 +3302,12 @@

Tasks RCU

call_rcu_tasks(), synchronize_rcu_tasks(), and rcu_barrier_tasks(). -In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, +In CONFIG_PREEMPTION=n kernels, trampolines cannot be preempted, so these APIs map to call_rcu(), synchronize_rcu(), and rcu_barrier(), respectively. -In CONFIG_PREEMPT=y kernels, trampolines can be preempted, +In CONFIG_PREEMPTION=y kernels, trampolines can be preempted, and these three APIs are therefore implemented by separate functions that check for voluntary context switches. diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index e98ff261a438b..087dc6c22c37c 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -210,8 +210,8 @@ over a rather long period of time, but improvements are always welcome! the rest of the system. 7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. - If the updater uses call_rcu() or synchronize_rcu(), + which is RCU-sched for PREEMPTION=n and RCU-preempt for + PREEMPTION=y. If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers my use rcu_read_lock() and rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), or any pair of primitives that disables and re-enables preemption, diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt index a2782df697328..5aa93c215af46 100644 --- a/Documentation/RCU/rcubarrier.txt +++ b/Documentation/RCU/rcubarrier.txt @@ -6,8 +6,8 @@ RCU (read-copy update) is a synchronization mechanism that can be thought of as a replacement for read-writer locking (among other things), but with very low-overhead readers that are immune to deadlock, priority inversion, and unbounded latency. RCU read-side critical sections are delimited -by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT -kernels, generate no code whatsoever. +by rcu_read_lock() and rcu_read_unlock(), which, in +non-CONFIG_PREEMPTION kernels, generate no code whatsoever. This means that RCU writers are unaware of the presence of concurrent readers, so that RCU updates to shared data must be undertaken quite @@ -303,10 +303,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last to smp_call_function() and further to smp_call_function_on_cpu(), causing this latter to spin until the cross-CPU invocation of rcu_barrier_func() has completed. This by itself would prevent - a grace period from completing on non-CONFIG_PREEMPT kernels, + a grace period from completing on non-CONFIG_PREEMPTION kernels, since each CPU must undergo a context switch (or other quiescent state) before the grace period can complete. However, this is - of no use in CONFIG_PREEMPT kernels. + of no use in CONFIG_PREEMPTION kernels. Therefore, on_each_cpu() disables preemption across its call to smp_call_function() and also across the local call to diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f48f4621ccbc2..bd510771b75ec 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -20,7 +20,7 @@ o A CPU looping with preemption disabled. o A CPU looping with bottom halves disabled. -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel +o For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel without invoking schedule(). If the looping in the kernel is really expected and desirable behavior, you might need to add some calls to cond_resched(). 
@@ -39,7 +39,7 @@ o Anything that prevents RCU's grace-period kthreads from running. result in the "rcu_.*kthread starved for" console-log message, which will include additional debugging information. -o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might +o A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if that low-priority task is not permitted to run on any other CPU, diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 7e1a8721637ab..7e03e8f80b293 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -648,9 +648,10 @@ Quick Quiz #1: Why is this argument naive? How could a deadlock This section presents a "toy" RCU implementation that is based on "classic RCU". It is also short on performance (but only for updates) and -on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT -kernels. The definitions of rcu_dereference() and rcu_assign_pointer() -are the same as those shown in the preceding section, so they are omitted. +on features such as hotplug CPU and the ability to run in +CONFIG_PREEMPTION kernels. The definitions of rcu_dereference() and +rcu_assign_pointer() are the same as those shown in the preceding +section, so they are omitted. void rcu_read_lock(void) { } diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 64aeee1009cab..0329a4d3fa9ec 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -128,6 +128,9 @@ allowed to examine the unevictable lru (mlocked pages) for pages to compact. This should be used on systems where stalls for minor page faults are an acceptable trade for large contiguous free memory. Set to 0 to prevent compaction from moving pages that are unevictable. Default value is 1. +On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault due +to compaction, which would block the task from becoming active until the fault +is resolved. dirty_background_bytes diff --git a/Documentation/printk-ringbuffer.txt b/Documentation/printk-ringbuffer.txt new file mode 100644 index 0000000000000..6bde5dbd8545b --- /dev/null +++ b/Documentation/printk-ringbuffer.txt @@ -0,0 +1,377 @@ +struct printk_ringbuffer +------------------------ +John Ogness + +Overview +~~~~~~~~ +As the name suggests, this ring buffer was implemented specifically to serve +the needs of the printk() infrastructure. The ring buffer itself is not +specific to printk and could be used for other purposes. _However_, the +requirements and semantics of printk are rather unique. If you intend to use +this ring buffer for anything other than printk, you need to be very clear on +its features, behavior, and pitfalls. + +Features +^^^^^^^^ +The printk ring buffer has the following features: + +- single global buffer +- resides in initialized data section (available at early boot) +- lockless readers +- supports multiple writers +- supports multiple non-consuming readers +- safe from any context (including NMI) +- groups bytes into variable length blocks (referenced by entries) +- entries tagged with sequence numbers + +Behavior +^^^^^^^^ +Since the printk ring buffer readers are lockless, there exists no +synchronization between readers and writers.
Basically writers are the tasks +in control and may overwrite any and all committed data at any time and from +any context. For this reason readers can miss entries if they are overwritten +before the reader was able to access the data. The reader API implementation +is such that reader access to entries is atomic, so there is no risk of +readers having to deal with partial or corrupt data. Also, entries are +tagged with sequence numbers so readers can recognize if entries were missed. + +Writing to the ring buffer consists of 2 steps. First a writer must reserve +an entry of desired size. After this step the writer has exclusive access +to the memory region. Once the data has been written to memory, it needs to +be committed to the ring buffer. After this step the entry has been inserted +into the ring buffer and assigned an appropriate sequence number. + +Once committed, a writer must no longer access the data directly. This is +because the data may have been overwritten and no longer exists. If a +writer must access the data, it should either keep a private copy before +committing the entry or use the reader API to gain access to the data. + +Because of how the data backend is implemented, entries that have been +reserved but not yet committed act as barriers, preventing future writers +from filling the ring buffer beyond the location of the reserved but not +yet committed entry region. For this reason it is *important* that writers +perform both reserve and commit as quickly as possible. Also, be aware that +preemption and local interrupts are disabled and writing to the ring buffer +is processor-reentrant locked during the reserve/commit window. Writers in +NMI contexts can still preempt any other writers, but as long as these +writers do not write a large amount of data with respect to the ring buffer +size, this should not become an issue. + +API +~~~ + +Declaration +^^^^^^^^^^^ +The printk ring buffer can be instantiated as a static structure: + + /* declare a static struct printk_ringbuffer */ + #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) + +The value of szbits specifies the size of the ring buffer in bits. The +cpulockptr field is a pointer to a prb_cpulock struct that is used to +perform processor-reentrant spin locking for the writers. It is specified +externally because it may be used for multiple ring buffers (or other +code) to synchronize writers without risk of deadlock. + +Here is an example of a declaration of a printk ring buffer specifying a +32KB (2^15) ring buffer: + +.... +DECLARE_STATIC_PRINTKRB_CPULOCK(rb_cpulock); +DECLARE_STATIC_PRINTKRB(rb, 15, &rb_cpulock); +.... + +If writers will be using multiple ring buffers and the ordering of that usage +is not clear, the same prb_cpulock should be used for both ring buffers. + +Writer API +^^^^^^^^^^ +The writer API consists of 2 functions. The first is to reserve an entry in +the ring buffer, the second is to commit that data to the ring buffer. The +reserved entry information is stored within a provided `struct prb_handle`. + + /* reserve an entry */ + char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb, + unsigned int size); + + /* commit a reserved entry to the ring buffer */ + void prb_commit(struct prb_handle *h); + +Here is an example of a function to write data to a ring buffer: + +.... 
+int write_data(struct printk_ringbuffer *rb, char *data, int size) +{ + struct prb_handle h; + char *buf; + + buf = prb_reserve(&h, rb, size); + if (!buf) + return -1; + memcpy(buf, data, size); + prb_commit(&h); + + return 0; +} +.... + +Pitfalls +++++++++ +Be aware that prb_reserve() can fail. A retry might be successful, but it +depends entirely on whether or not the next part of the ring buffer to +overwrite belongs to reserved but not yet committed entries of other writers. +Writers can use the prb_inc_lost() function to allow readers to notice that a +message was lost. + +Reader API +^^^^^^^^^^ +The reader API utilizes a `struct prb_iterator` to track the reader's +position in the ring buffer. + + /* declare a pre-initialized static iterator for a ring buffer */ + #define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr) + + /* initialize iterator for a ring buffer (if static macro NOT used) */ + void prb_iter_init(struct prb_iterator *iter, + struct printk_ringbuffer *rb, u64 *seq); + + /* make a deep copy of an iterator */ + void prb_iter_copy(struct prb_iterator *dest, + struct prb_iterator *src); + + /* non-blocking, advance to next entry (and read the data) */ + int prb_iter_next(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + + /* blocking, advance to next entry (and read the data) */ + int prb_iter_wait_next(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + + /* position iterator at the entry seq */ + int prb_iter_seek(struct prb_iterator *iter, u64 seq); + + /* read data at current position */ + int prb_iter_data(struct prb_iterator *iter, char *buf, + int size, u64 *seq); + +Typically prb_iter_data() is not needed because the data can be retrieved +directly with prb_iter_next(). + +Here is an example of a non-blocking function that will read all the data in +a ring buffer: + +.... +void read_all_data(struct printk_ringbuffer *rb, char *buf, int size) +{ + struct prb_iterator iter; + u64 prev_seq = 0; + u64 seq; + int ret; + + prb_iter_init(&iter, rb, NULL); + + for (;;) { + ret = prb_iter_next(&iter, buf, size, &seq); + if (ret > 0) { + if (seq != ++prev_seq) { + /* "seq - prev_seq" entries missed */ + prev_seq = seq; + } + /* process buf here */ + } else if (ret == 0) { + /* hit the end, done */ + break; + } else if (ret < 0) { + /* + * iterator is invalid, a writer overtook us, reset the + * iterator and keep going, entries were missed + */ + prb_iter_init(&iter, rb, NULL); + } + } +} +.... + +Pitfalls +++++++++ +The reader's iterator can become invalid at any time because the reader was +overtaken by a writer. Typically the reader should reset the iterator back +to the current oldest entry (which will be newer than the entry the reader +was at) and continue, noting the number of entries that were missed. + +Utility API +^^^^^^^^^^^ +Several functions are available as convenience for external code. 
+ + /* query the size of the data buffer */ + int prb_buffer_size(struct printk_ringbuffer *rb); + + /* skip a seq number to signify a lost record */ + void prb_inc_lost(struct printk_ringbuffer *rb); + + /* processor-reentrant spin lock */ + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + + /* processor-reentrant spin unlock */ + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); + +Pitfalls +++++++++ +Although the value returned by prb_buffer_size() does represent an absolute +upper bound, the amount of data that can be stored within the ring buffer +is actually less because of the additional storage space of a header for each +entry. + +The prb_lock() and prb_unlock() functions can be used to synchronize between +ring buffer writers and other external activities. The function of a +processor-reentrant spin lock is to disable preemption and local interrupts +and synchronize against other processors. It does *not* protect against +multiple contexts of a single processor, i.e. NMI. + +Implementation +~~~~~~~~~~~~~~ +This section describes several of the implementation concepts and details to +help developers better understand the code. + +Entries +^^^^^^^ +All ring buffer data is stored within a single static byte array. The reason +for this is to ensure that any pointers to the data (past and present) will +always point to valid memory. This is important because the lockless readers +may be preempted for long periods of time and when they resume may be working +with expired pointers. + +Entries are identified by start index and size. (The start index plus size +is the start index of the next entry.) The start index is not simply an +offset into the byte array, but rather a logical position (lpos) that maps +directly to byte array offsets. + +For example, for a byte array of 1000, an entry may have a start index +of 100. Another entry may have a start index of 1100. And yet another 2100. +All of these entries are pointing to the same memory region, but only the most +recent entry is valid. The other entries are pointing to valid memory, but +represent entries that have been overwritten. + +Note that due to overflowing, the most recent entry is not necessarily the one +with the highest lpos value. Indeed, the printk ring buffer initializes its +data such that an overflow happens relatively quickly in order to validate the +handling of this situation. The implementation assumes that an lpos (unsigned +long) will never completely wrap while a reader is preempted. If this were to +become an issue, the seq number (which never wraps) could be used to increase +the robustness of handling this situation. + +Buffer Wrapping +^^^^^^^^^^^^^^^ +If an entry starts near the end of the byte array but would extend beyond it, +a special terminating entry (size = -1) is inserted into the byte array and +the real entry is placed at the beginning of the byte array. This can waste +space at the end of the byte array, but simplifies the implementation by +allowing writers to always work with contiguous buffers. + +Note that the size field is the first 4 bytes of the entry header. Also note +that calc_next() always ensures that there are at least 4 bytes left at the +end of the byte array to allow room for a terminating entry.
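To make the lpos example from the Entries section above concrete, here is a minimal, purely illustrative sketch of the lpos-to-offset mapping. It is not part of the ring buffer API; it assumes the hypothetical 1000-byte array used in that example.

....
#include <stdio.h>

#define BUF_SIZE 1000UL	/* hypothetical byte array size from the example */

/* map a logical position (which grows without bound) to a byte array offset */
static unsigned long lpos_to_offset(unsigned long lpos)
{
	return lpos % BUF_SIZE;
}

int main(void)
{
	/*
	 * lpos 100, 1100 and 2100 all map to offset 100; only the most
	 * recently committed entry at that offset is still valid, the
	 * older ones have been overwritten.
	 */
	printf("%lu %lu %lu\n", lpos_to_offset(100),
	       lpos_to_offset(1100), lpos_to_offset(2100));
	return 0;
}
....

Since the real buffer size is a power of two (2^szbits, see the declaration macros above), the modulo shown here can reduce to a simple bit mask, which keeps the cost of ever-growing lpos values negligible.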
+ +Ring Buffer Pointers +^^^^^^^^^^^^^^^^^^^^ +Three pointers (lpos values) are used to manage the ring buffer: + + - _tail_: points to the oldest entry + - _head_: points to where the next new committed entry will be + - _reserve_: points to where the next new reserved entry will be + +These pointers always maintain a logical ordering: + + tail <= head <= reserve + +The reserve pointer moves forward when a writer reserves a new entry. The +head pointer moves forward when a writer commits a new entry. + +The reserve pointer cannot overwrite the tail pointer in a wrap situation. In +such a situation, the tail pointer must be "pushed forward", thus +invalidating that oldest entry. Readers identify if they are accessing a +valid entry by ensuring their entry pointer is `>= tail && < head`. + +If the tail pointer is equal to the head pointer, it cannot be pushed and any +reserve operation will fail. The only resolution is for writers to commit +their reserved entries. + +Processor-Reentrant Locking +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The purpose of the processor-reentrant locking is to limit the interruption +scenarios of writers to 2 contexts. This allows for a simplified +implementation where: + +- The reserve/commit window only exists on 1 processor at a time. A reserve + can never fail due to uncommitted entries of other processors. + +- When committing entries, it is trivial to handle the situation when + subsequent entries have already been committed, i.e. managing the head + pointer. + +Performance +~~~~~~~~~~~ +Some basic tests were performed on a quad Intel(R) Xeon(R) CPU E5-2697 v4 at +2.30GHz (36 cores / 72 threads). All tests involved writing a total of +32,000,000 records at an average of 33 bytes each. Each writer was pinned to +its own CPU and would write as fast as it could until a total of 32,000,000 +records were written. All tests involved 2 readers that were both pinned +together to another CPU. Each reader would read as fast as it could and track +how many of the 32,000,000 records it could read. All tests used a ring buffer +of 16KB in size, which holds around 350 records (header + data for each +entry). + +The only difference between the tests is the number of writers (and thus also +the number of records per writer). As more writers are added, the time to +write a record increases. This is because data pointers, modified via cmpxchg, +and global data access in general become more contended. + +1 writer +^^^^^^^^ + runtime: 0m 18s + reader1: 16219900/32000000 (50%) records + reader2: 16141582/32000000 (50%) records + +2 writers +^^^^^^^^^ + runtime: 0m 32s + reader1: 16327957/32000000 (51%) records + reader2: 16313988/32000000 (50%) records + +4 writers +^^^^^^^^^ + runtime: 0m 42s + reader1: 16421642/32000000 (51%) records + reader2: 16417224/32000000 (51%) records + +8 writers +^^^^^^^^^ + runtime: 0m 43s + reader1: 16418300/32000000 (51%) records + reader2: 16432222/32000000 (51%) records + +16 writers +^^^^^^^^^^ + runtime: 0m 54s + reader1: 16539189/32000000 (51%) records + reader2: 16542711/32000000 (51%) records + +32 writers +^^^^^^^^^^ + runtime: 1m 13s + reader1: 16731808/32000000 (52%) records + reader2: 16735119/32000000 (52%) records + +Comments +^^^^^^^^ +It is particularly interesting to compare/contrast the 1-writer and 32-writer +tests. Despite the writing of the 32,000,000 records taking over 4 times +longer, the readers (which perform no cmpxchg) were still unable to keep up. 
+This shows that the memory contention between the increasing number of CPUs +also has a dramatic effect on readers. + +It should also be noted that in all cases each reader was able to read >=50% +of the records. This means that a single reader would have been able to keep +up with the writer(s) in all cases, becoming slightly easier as more writers +are added. This was the purpose of pinning 2 readers to 1 CPU: to observe how +maximum reader performance changes. diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 1fbc69894eed0..1e0020b0bc745 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE itself or any nested functions that those functions call. If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPT is set), + be called with preemption enabled (when CONFIG_PREEMPTION is set), but this is not guaranteed. FTRACE_OPS_FL_IPMODIFY diff --git a/arch/Kconfig b/arch/Kconfig index 238dccfa76910..a886cbe86efc2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -31,6 +31,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 1d5716bc060be..6883bc952d224 100644 --- a/arch/alpha/include/asm/spinlock_types.h +++ b/arch/alpha/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index ea74a1eee5d9d..29a45c2f0b179 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -331,11 +331,11 @@ resume_user_mode_begin: resume_kernel_mode: ; Disable Interrupts from this point on - ; CONFIG_PREEMPT: This is a must for preempt_schedule_irq() - ; !CONFIG_PREEMPT: To ensure restore_regs is intr safe + ; CONFIG_PREEMPTION: This is a must for preempt_schedule_irq() + ; !CONFIG_PREEMPTION: To ensure restore_regs is intr safe IRQ_DISABLE r9 -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION ; Can't preempt if preemption disabled GET_CURR_THR_INFO_FROM_SP r10 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fac9999d6ef5d..eec3055772986 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -31,6 +31,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU @@ -63,7 +64,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT @@ -102,6 +103,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select 
HAVE_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 46d41140df27d..c421b5b81946e 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -23,6 +23,8 @@ #endif #ifndef __ASSEMBLY__ +#include + struct irqaction; struct pt_regs; diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 5976958647fe1..a37c0803954ba 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #define TICKET_SHIFT 16 typedef struct { diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h index d3e937dcee4d0..285e6248454fc 100644 --- a/arch/arm/include/asm/switch_to.h +++ b/arch/arm/include/asm/switch_to.h @@ -4,13 +4,20 @@ #include +#if defined CONFIG_PREEMPT_RT && defined CONFIG_HIGHMEM +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); +#else +static inline void +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } +#endif + /* * For v7 SMP cores running a preemptible kernel we may be pre-empted * during a TLB maintenance operation, so execute an inner-shareable dsb * to ensure that the maintenance completes in case we migrate to another * CPU. */ -#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) +#if defined(CONFIG_PREEMPTION) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) #define __complete_pending_tlbi() dsb(ish) #else #define __complete_pending_tlbi() @@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info #define switch_to(prev,next,last) \ do { \ __complete_pending_tlbi(); \ + switch_kmaps(prev, next); \ last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ } while (0) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 0d0d5178e2c32..13be9c2137e26 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -46,6 +46,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ @@ -139,7 +140,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ +#define TIF_NEED_RESCHED_LAZY 7 #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -149,6 +151,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -164,7 +167,8 @@ extern int 
vfp_restore_user_hwstate(struct user_vfp *, * Change these and you break ASM code in entry-common.S */ #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED_LAZY) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index c773b829ee8ee..f3a0e1cd1f048 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -53,6 +53,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index a874b753397ec..1e689a727cb9b 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -204,13 +204,20 @@ __irq_svc: svc_entry irq_handler -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! + + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @@ -219,14 +226,20 @@ ENDPROC(__irq_svc) .ltorg -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION svc_preempt: mov r8, lr 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 271cb8a1eba1e..fd039b1b37319 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -53,7 +53,9 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne fast_work_pending + tst r1, #_TIF_SECCOMP bne fast_work_pending @@ -90,8 +92,11 @@ __ret_fast_syscall: cmp r2, #TASK_SIZE blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne do_slower_path + tst r1, #_TIF_SECCOMP beq no_work_pending +do_slower_path: UNWIND(.fnend ) ENDPROC(ret_fast_syscall) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ab2568996ddb0..50d43ce823bfb 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + 
_TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 46e1be9e57a81..581caf6493a94 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -682,11 +682,9 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CPU_BACKTRACE: - printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); - printk_nmi_exit(); break; default: diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index c053abd1fb539..abb7dd7e656fd 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -248,6 +248,8 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" +#elif defined(CONFIG_PREEMPT_RT) +#define S_PREEMPT " PREEMPT_RT" #else #define S_PREEMPT "" #endif diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 0ee8fc4b4672c..dc8f152f35566 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -135,13 +135,13 @@ flush_levels: and r1, r1, #7 @ mask of the bits for current cache only cmp r1, #2 @ see what cache we have at this level blt skip @ skip if no cache, or just i-cache -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION save_and_disable_irqs_notrace r9 @ make cssr&csidr read atomic #endif mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr isb @ isb to sych the new cssr&csidr mrc p15, 1, r1, c0, c0, 0 @ read the new csidr -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION restore_irqs_notrace r9 #endif and r2, r1, #7 @ extract the length of the cache lines diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S index a0035c426ce63..1bc3a0a507539 100644 --- a/arch/arm/mm/cache-v7m.S +++ b/arch/arm/mm/cache-v7m.S @@ -183,13 +183,13 @@ flush_levels: and r1, r1, #7 @ mask of the bits for current cache only cmp r1, #2 @ see what cache we have at this level blt skip @ skip if no cache, or just i-cache -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION save_and_disable_irqs_notrace r9 @ make cssr&csidr read atomic #endif write_csselr r10, r1 @ set current cache level isb @ isb to sych the new cssr&csidr read_ccsidr r1 @ read the new csidr -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION restore_irqs_notrace r9 #endif and r2, r1, #7 @ extract the length of the cache lines diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bd0f4821f7e11..865fd8fbfe59c 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -414,6 +414,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; @@ -481,6 +484,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, static int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { + if (interrupts_enabled(regs)) + local_irq_enable(); + do_bad_area(addr, fsr, regs); return 0; } diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index a76f8ace9ce66..eb4a361d47d7b 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,6 +31,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } +static unsigned int fixmap_idx(int type) +{ + return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +} + void *kmap(struct page *page) { might_sleep(); @@ -51,12 +56,13 @@ EXPORT_SYMBOL(kunmap); void *kmap_atomic(struct page *page) { + pte_t pte = mk_pte(page, kmap_prot); unsigned int idx; unsigned long vaddr; void *kmap; int 
type; - preempt_disable(); + preempt_disable_nort(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -76,7 +82,7 @@ void *kmap_atomic(struct page *page) type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM /* @@ -90,7 +96,10 @@ void *kmap_atomic(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. */ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } @@ -103,44 +112,75 @@ void __kunmap_atomic(void *kvaddr) if (kvaddr >= (void *)FIXADDR_START) { type = kmap_atomic_idx(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = __pte(0); +#endif #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr != __fix_to_virt(idx)); - set_fixmap_pte(idx, __pte(0)); #else (void) idx; /* to kill a warning */ #endif + set_fixmap_pte(idx, __pte(0)); kmap_atomic_idx_pop(); } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } pagefault_enable(); - preempt_enable(); + preempt_enable_nort(); } EXPORT_SYMBOL(__kunmap_atomic); void *kmap_atomic_pfn(unsigned long pfn) { + pte_t pte = pfn_pte(pfn, kmap_prot); unsigned long vaddr; int idx, type; struct page *page = pfn_to_page(pfn); - preempt_disable(); + preempt_disable_nort(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); + idx = fixmap_idx(type); vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(get_fixmap_pte(vaddr))); #endif - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); +#ifdef CONFIG_PREEMPT_RT + current->kmap_pte[type] = pte; +#endif + set_fixmap_pte(idx, pte); return (void *)vaddr; } +#if defined CONFIG_PREEMPT_RT +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) +{ + int i; + + /* + * Clear @prev's kmap_atomic mappings + */ + for (i = 0; i < prev_p->kmap_idx; i++) { + int idx = fixmap_idx(i); + + set_fixmap_pte(idx, __pte(0)); + } + /* + * Restore @next_p's kmap_atomic mappings + */ + for (i = 0; i < next_p->kmap_idx; i++) { + int idx = fixmap_idx(i); + + if (!pte_none(next_p->kmap_pte[i])) + set_fixmap_pte(idx, next_p->kmap_pte[i]); + } +} +#endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f925361e51f3..d31752933f8cc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -34,32 +34,32 @@ config ARM64 select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_NMI_SAFE_CMPXCHG - select ARCH_INLINE_READ_LOCK if !PREEMPT - select ARCH_INLINE_READ_LOCK_BH if !PREEMPT - select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPT - select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_READ_UNLOCK if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPT - select ARCH_INLINE_WRITE_LOCK if !PREEMPT - select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPT - select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPT - select 
ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPT - select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPT - select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPT - select ARCH_INLINE_SPIN_LOCK if !PREEMPT - select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPT - select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPT - select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT - select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_INLINE_READ_LOCK if !PREEMPTION + select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION + select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION + select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION + select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION + select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION + select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS @@ -68,6 +68,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS @@ -158,6 +159,7 @@ config ARM64 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_RCU_TABLE_FREE diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index e273faca924f9..999da59f03a9d 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -97,7 +97,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data, * input when running on a preemptible kernel, but process the * data block by block instead. 
*/ - if (IS_ENABLED(CONFIG_PREEMPT) && + if (IS_ENABLED(CONFIG_PREEMPTION) && chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) chunk = SHA256_BLOCK_SIZE - sctx->count % SHA256_BLOCK_SIZE; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index b8cf7c85ffa2a..2cc0dd8bd9f78 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -699,8 +699,8 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU * where