From 64039117505a024c95924c48559e9139ec373d02 Mon Sep 17 00:00:00 2001 From: Cheng-mean Liu Date: Tue, 25 Apr 2017 22:21:37 -0700 Subject: [PATCH 2/2] HV_SOCK patch for 4.11 kernel --- MAINTAINERS | 2 + drivers/hv/channel.c | 4 +- drivers/hv/channel_mgmt.c | 26 ++----------- drivers/hv/vmbus_drv.c | 6 ++- include/linux/hyperv.h | 23 +++++++++-- include/linux/socket.h | 4 +- include/net/af_hvsock.h | 78 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/hyperv.h | 23 +++++++++++ net/Kconfig | 1 + net/Makefile | 1 + security/selinux/hooks.c | 3 +- security/selinux/include/classmap.h | 2 +- 12 files changed, 141 insertions(+), 32 deletions(-) create mode 100644 include/net/af_hvsock.h diff --git a/MAINTAINERS b/MAINTAINERS index c776906..8c08ee7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6058,7 +6058,9 @@ F: drivers/net/hyperv/ F: drivers/scsi/storvsc_drv.c F: drivers/uio/uio_hv_generic.c F: drivers/video/fbdev/hyperv_fb.c +F: net/hv_sock/ F: include/linux/hyperv.h +F: include/net/af_hvsock.h F: tools/hv/ F: Documentation/ABI/stable/sysfs-bus-vmbus diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index bd0d198..f35d61f 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -538,7 +538,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) * To resolve the race, we can serialize them by disabling the * tasklet when the latter is running here. 
*/ - hv_event_tasklet_disable(channel); + tasklet_disable(&channel->callback_event); /* * In case a device driver's probe() fails (e.g., @@ -605,8 +605,6 @@ static int vmbus_close_internal(struct vmbus_channel *channel) get_order(channel->ringbuffer_pagecount * PAGE_SIZE)); out: - hv_event_tasklet_enable(channel); - return ret; } diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index f33465d..bf846d0 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -350,7 +350,8 @@ static struct vmbus_channel *alloc_channel(void) static void free_channel(struct vmbus_channel *channel) { tasklet_kill(&channel->callback_event); - kfree(channel); + + kfree_rcu(channel, rcu); } static void percpu_channel_enq(void *arg) @@ -359,14 +360,14 @@ static void percpu_channel_enq(void *arg) struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - list_add_tail(&channel->percpu_list, &hv_cpu->chan_list); + list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); } static void percpu_channel_deq(void *arg) { struct vmbus_channel *channel = arg; - list_del(&channel->percpu_list); + list_del_rcu(&channel->percpu_list); } @@ -381,19 +382,6 @@ static void vmbus_release_relid(u32 relid) true); } -void hv_event_tasklet_disable(struct vmbus_channel *channel) -{ - tasklet_disable(&channel->callback_event); -} - -void hv_event_tasklet_enable(struct vmbus_channel *channel) -{ - tasklet_enable(&channel->callback_event); - - /* In case there is any pending event */ - tasklet_schedule(&channel->callback_event); -} - void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) { unsigned long flags; @@ -402,7 +390,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) BUG_ON(!channel->rescind); BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); - hv_event_tasklet_disable(channel); if (channel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(channel->target_cpu, @@ -411,7 +398,6 @@ void 
hv_process_channel_removal(struct vmbus_channel *channel, u32 relid) percpu_channel_deq(channel); put_cpu(); } - hv_event_tasklet_enable(channel); if (channel->primary_channel == NULL) { list_del(&channel->listentry); @@ -505,7 +491,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) init_vp_index(newchannel, dev_type); - hv_event_tasklet_disable(newchannel); if (newchannel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(newchannel->target_cpu, @@ -515,7 +500,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) percpu_channel_enq(newchannel); put_cpu(); } - hv_event_tasklet_enable(newchannel); /* * This state is used to indicate a successful open @@ -565,7 +549,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) list_del(&newchannel->listentry); mutex_unlock(&vmbus_connection.channel_mutex); - hv_event_tasklet_disable(newchannel); if (newchannel->target_cpu != get_cpu()) { put_cpu(); smp_call_function_single(newchannel->target_cpu, @@ -574,7 +557,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) percpu_channel_deq(newchannel); put_cpu(); } - hv_event_tasklet_enable(newchannel); vmbus_release_relid(newchannel->offermsg.child_relid); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index da6b59b..8370b9d 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -939,8 +939,10 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (relid == 0) continue; + rcu_read_lock(); + /* Find channel based on relid */ - list_for_each_entry(channel, &hv_cpu->chan_list, percpu_list) { + list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { if (channel->offermsg.child_relid != relid) continue; @@ -956,6 +958,8 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) tasklet_schedule(&channel->callback_event); } } + + rcu_read_unlock(); } } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62bbf3c..c5a37c7 100644 
--- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -845,6 +845,13 @@ struct vmbus_channel { * link up channels based on their CPU affinity. */ struct list_head percpu_list; + + /* + * Defer freeing channel until after all cpu's have + * gone through grace period. + */ + struct rcu_head rcu; + /* * For performance critical channels (storage, networking * etc,), Hyper-V has a mechanism to enhance the throughput @@ -1430,9 +1437,6 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf, const int *srv_version, int srv_vercnt, int *nego_fw_version, int *nego_srv_version); -void hv_event_tasklet_disable(struct vmbus_channel *channel); -void hv_event_tasklet_enable(struct vmbus_channel *channel); - void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid); void vmbus_setevent(struct vmbus_channel *channel); @@ -1616,5 +1620,18 @@ static inline void commit_rd_index(struct vmbus_channel *channel) hv_signal_on_read(channel); } +struct vmpipe_proto_header { + u32 pkt_type; + u32 data_size; +}; + +#define HVSOCK_HEADER_LEN (sizeof(struct vmpacket_descriptor) + \ + sizeof(struct vmpipe_proto_header)) + +/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */ +#define PREV_INDICES_LEN (sizeof(u64)) +#define HVSOCK_PKT_LEN(payload_len) (HVSOCK_HEADER_LEN + \ + ALIGN((payload_len), 8) + \ + PREV_INDICES_LEN) #endif /* _HYPERV_H */ diff --git a/include/linux/socket.h b/include/linux/socket.h index 0820274..f681af6 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -207,7 +207,8 @@ struct ucred { * reuses AF_INET address family */ -#define AF_MAX 44 /* For now.. */ +#define AF_HYPERV 44 /* Hyper-V Sockets */ +#define AF_MAX 45 /* For now.. */ /* Protocol families, same as address families. 
*/ #define PF_UNSPEC AF_UNSPEC @@ -256,6 +257,7 @@ struct ucred { #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC +#define PF_HYPERV AF_HYPERV #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h new file mode 100644 index 0000000..e7a8a3a --- /dev/null +++ b/include/net/af_hvsock.h @@ -0,0 +1,78 @@ +#ifndef __AF_HVSOCK_H__ +#define __AF_HVSOCK_H__ + +#include +#include +#include + +/* The host side's design of the feature requires 5 exact 4KB pages for + * recv/send rings respectively -- this is suboptimal considering memory + * consumption, however unluckily we have to live with it, before the + * host comes up with a better design in the future. + */ +#define PAGE_SIZE_4K 4096 +#define RINGBUFFER_HVSOCK_RCV_SIZE (PAGE_SIZE_4K * 5) +#define RINGBUFFER_HVSOCK_SND_SIZE (PAGE_SIZE_4K * 5) + +/* The MTU is 16KB per the host side's design. + * In future, the buffer can be eliminated when we switch to use the coming + * new VMBus ringbuffer "in-place consumption" APIs, by which we can + * directly copy data from VMBus ringbuffer into the userspace buffer. + */ +#define HVSOCK_MTU_SIZE (1024 * 16) +struct hvsock_recv_buf { + unsigned int data_len; + unsigned int data_offset; + + struct vmpipe_proto_header hdr; + u8 buf[HVSOCK_MTU_SIZE]; +}; + +/* In the VM, actually we can send up to HVSOCK_MTU_SIZE bytes of payload, + * but for now let's use a smaller size to minimize the dynamically-allocated + * buffer. Note: the buffer can be eliminated in future when we add new VMBus + * ringbuffer APIs that allow us to directly copy data from userspace buf to + * VMBus ringbuffer. + */ +#define HVSOCK_MAX_SND_SIZE_BY_VM (1024 * 4) +struct hvsock_send_buf { + struct vmpipe_proto_header hdr; + u8 buf[HVSOCK_MAX_SND_SIZE_BY_VM]; +}; + +struct hvsock_sock { + /* sk must be the first member. 
*/ + struct sock sk; + + struct sockaddr_hv local_addr; + struct sockaddr_hv remote_addr; + + /* protected by the global hvsock_mutex */ + struct list_head bound_list; + struct list_head connected_list; + + struct list_head accept_queue; + /* used by enqueue and dequeue */ + struct mutex accept_queue_mutex; + + struct delayed_work dwork; + + u32 peer_shutdown; + + struct vmbus_channel *channel; + + struct hvsock_send_buf *send; + struct hvsock_recv_buf *recv; +}; + +static inline struct hvsock_sock *sk_to_hvsock(struct sock *sk) +{ + return (struct hvsock_sock *)sk; +} + +static inline struct sock *hvsock_to_sk(struct hvsock_sock *hvsk) +{ + return (struct sock *)hvsk; +} + +#endif /* __AF_HVSOCK_H__ */ diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index e347b24..eb3e44b 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -26,6 +26,7 @@ #define _UAPI_HYPERV_H #include +#include /* * Framework version for util services. @@ -396,4 +397,26 @@ struct hv_kvp_ip_msg { struct hv_kvp_ipaddr_value kvp_ip_val; } __attribute__((packed)); +/* This is the address format of Hyper-V Sockets. + * Note: here we just borrow the kernel's built-in type uuid_le. When + * an application calls bind() or connect(), the 2 uuid_le members of struct + * sockaddr_hv must be in GUID format. + * The GUID format differs from the UUID format only in the byte order of + * the first 3 fields. 
Refer to: + * https://en.wikipedia.org/wiki/Globally_unique_identifier + */ +struct sockaddr_hv { + __kernel_sa_family_t shv_family; /* Address family */ + __u16 reserved; /* Must be Zero */ + uuid_le shv_vm_guid; /* VM ID */ + uuid_le shv_service_guid; /* Service ID */ +}; + +#define SHV_VMID_GUEST NULL_UUID_LE +#define SHV_VMID_HOST NULL_UUID_LE + +#define SHV_SERVICE_ID_ANY NULL_UUID_LE + +#define SHV_PROTO_RAW 1 + #endif /* _UAPI_HYPERV_H */ diff --git a/net/Kconfig b/net/Kconfig index 102f781..ba5a4a2 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -232,6 +232,7 @@ source "net/dns_resolver/Kconfig" source "net/batman-adv/Kconfig" source "net/openvswitch/Kconfig" source "net/vmw_vsock/Kconfig" +source "net/hv_sock/Kconfig" source "net/netlink/Kconfig" source "net/mpls/Kconfig" source "net/hsr/Kconfig" diff --git a/net/Makefile b/net/Makefile index 9b68155..26613a7 100644 --- a/net/Makefile +++ b/net/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_PSAMPLE) += psample/ obj-$(CONFIG_NET_IFE) += ife/ obj-$(CONFIG_OPENVSWITCH) += openvswitch/ obj-$(CONFIG_VSOCKETS) += vmw_vsock/ +obj-$(CONFIG_HYPERV_SOCK) += hv_sock/ obj-$(CONFIG_MPLS) += mpls/ obj-$(CONFIG_HSR) += hsr/ ifneq ($(CONFIG_NET_SWITCHDEV),) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 0c2ac31..4406621 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1405,7 +1405,8 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_QIPCRTR_SOCKET; case PF_SMC: return SECCLASS_SMC_SOCKET; -#if PF_MAX > 44 + +#if PF_MAX > 45 #error New address family defined, please update this function. 
#endif } diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d429c4a..b7d653d 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -234,6 +234,6 @@ struct security_class_mapping secclass_map[] = { { NULL } }; -#if PF_MAX > 44 +#if PF_MAX > 45 #error New address family defined, please update secclass_map. #endif -- 2.7.4