diff options
author | nbd <nbd@3c298f89-4303-0410-b956-a3cf2f4a3e73> | 2005-10-22 22:03:56 +0000 |
---|---|---|
committer | nbd <nbd@3c298f89-4303-0410-b956-a3cf2f4a3e73> | 2005-10-22 22:03:56 +0000 |
commit | 30bf34f05c0f66d9d9221f7ebfe9808024337dd4 (patch) | |
tree | bfaeed0ba28cc75ed562c61a7516fba097e78e13 /target/linux/linux-2.6/patches/generic | |
parent | 2be847dc58145c4946781ebde7cda49ce0e5f099 (diff) |
add pf_ring patches for kernel and libpcap
git-svn-id: svn://svn.openwrt.org/openwrt/trunk/openwrt@2266 3c298f89-4303-0410-b956-a3cf2f4a3e73
Diffstat (limited to 'target/linux/linux-2.6/patches/generic')
-rw-r--r-- | target/linux/linux-2.6/patches/generic/104-pf_ring.patch | 5299 |
1 files changed, 5299 insertions, 0 deletions
diff --git a/target/linux/linux-2.6/patches/generic/104-pf_ring.patch b/target/linux/linux-2.6/patches/generic/104-pf_ring.patch new file mode 100644 index 000000000..759fb2cc9 --- /dev/null +++ b/target/linux/linux-2.6/patches/generic/104-pf_ring.patch @@ -0,0 +1,5299 @@ +diff --unified --recursive --new-file linux-2.6.12.5/include/linux/ring.h linux-2.6.12.5-1-686-smp-ring3/include/linux/ring.h +--- linux-2.6.12.5/include/linux/ring.h 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/include/linux/ring.h 2005-10-22 23:50:44.951445250 +0200 +@@ -0,0 +1,108 @@ ++/* ++ * Definitions for packet ring ++ * ++ * 2004 - Luca Deri <deri@ntop.org> ++ */ ++#ifndef __RING_H ++#define __RING_H ++ ++ ++#define INCLUDE_MAC_INFO ++ ++#ifdef INCLUDE_MAC_INFO ++#define SKB_DISPLACEMENT 14 /* Include MAC address information */ ++#else ++#define SKB_DISPLACEMENT 0 /* Do NOT include MAC address information */ ++#endif ++ ++#define RING_MAGIC ++#define RING_MAGIC_VALUE 0x88 ++#define RING_FLOWSLOT_VERSION 5 ++#define RING_VERSION "3.0" ++ ++#define SO_ADD_TO_CLUSTER 99 ++#define SO_REMOVE_FROM_CLUSTER 100 ++#define SO_SET_REFLECTOR 101 ++ ++/* *********************************** */ ++ ++#ifndef HAVE_PCAP ++struct pcap_pkthdr { ++ struct timeval ts; /* time stamp */ ++ u_int32_t caplen; /* length of portion present */ ++ u_int32_t len; /* length this packet (off wire) */ ++}; ++#endif ++ ++/* *********************************** */ ++ ++enum cluster_type { ++ cluster_per_flow = 0, ++ cluster_round_robin ++}; ++ ++/* *********************************** */ ++ ++#define RING_MIN_SLOT_SIZE (60+sizeof(struct pcap_pkthdr)) ++#define RING_MAX_SLOT_SIZE (1514+sizeof(struct pcap_pkthdr)) ++ ++/* *********************************** */ ++ ++typedef struct flowSlotInfo { ++ u_int16_t version, sample_rate; ++ u_int32_t tot_slots, slot_len, tot_mem; ++ ++ u_int64_t tot_pkts, tot_lost; ++ u_int64_t tot_insert, tot_read; ++ u_int16_t insert_idx; ++ u_int16_t remove_idx; ++} FlowSlotInfo; ++ ++/* *********************************** */ ++ ++typedef struct flowSlot { ++#ifdef RING_MAGIC ++ u_char magic; /* It must alwasy be zero */ ++#endif ++ u_char slot_state; /* 0=empty, 1=full */ ++ u_char bucket; /* bucket[bucketLen] */ ++} FlowSlot; ++ ++/* *********************************** */ ++ ++#ifdef __KERNEL__ ++ ++FlowSlotInfo* getRingPtr(void); ++int allocateRing(char *deviceName, u_int numSlots, ++ u_int bucketLen, u_int sampleRate); ++unsigned int pollRing(struct file *fp, struct poll_table_struct * wait); ++void deallocateRing(void); ++ ++/* ************************* */ ++ ++typedef int (*handle_ring_skb)(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++extern handle_ring_skb get_skb_ring_handler(void); ++extern void set_skb_ring_handler(handle_ring_skb the_handler); ++extern void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb); ++ ++typedef int (*handle_ring_buffer)(struct net_device *dev, ++ char *data, int len); ++extern handle_ring_buffer get_buffer_ring_handler(void); ++extern void set_buffer_ring_handler(handle_ring_buffer the_handler); ++extern int do_buffer_ring_handler(struct net_device *dev, ++ char *data, int len); ++#endif /* __KERNEL__ */ ++ ++/* *********************************** */ ++ ++#define PF_RING 27 /* Packet Ring */ ++#define SOCK_RING PF_RING ++ ++/* ioctl() */ ++#define SIORINGPOLL 0x8888 ++ ++/* *********************************** */ ++ ++#endif /* __RING_H */ +diff --unified --recursive --new-file linux-2.6.12.5/net/Kconfig linux-2.6.12.5-1-686-smp-ring3/net/Kconfig +--- linux-2.6.12.5/net/Kconfig 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Kconfig 2005-10-22 23:50:45.535481750 +0200 +@@ -72,6 +72,7 @@ + + Say Y unless you know what you are doing. + ++source "net/ring/Kconfig" + config INET + bool "TCP/IP networking" + ---help--- +diff --unified --recursive --new-file linux-2.6.12.5/net/Makefile linux-2.6.12.5-1-686-smp-ring3/net/Makefile +--- linux-2.6.12.5/net/Makefile 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Makefile 2005-10-22 23:50:45.491479000 +0200 +@@ -41,6 +41,7 @@ + obj-$(CONFIG_DECNET) += decnet/ + obj-$(CONFIG_ECONET) += econet/ + obj-$(CONFIG_VLAN_8021Q) += 8021q/ ++obj-$(CONFIG_RING) += ring/ + obj-$(CONFIG_IP_SCTP) += sctp/ + + ifeq ($(CONFIG_NET),y) +diff --unified --recursive --new-file linux-2.6.12.5/net/Makefile.ORG linux-2.6.12.5-1-686-smp-ring3/net/Makefile.ORG +--- linux-2.6.12.5/net/Makefile.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/Makefile.ORG 2005-10-22 23:50:45.483478500 +0200 +@@ -0,0 +1,48 @@ ++# ++# Makefile for the linux networking. ++# ++# 2 Sep 2000, Christoph Hellwig <hch@infradead.org> ++# Rewritten to use lists instead of if-statements. ++# ++ ++obj-y := nonet.o ++ ++obj-$(CONFIG_NET) := socket.o core/ ++ ++tmp-$(CONFIG_COMPAT) := compat.o ++obj-$(CONFIG_NET) += $(tmp-y) ++ ++# LLC has to be linked before the files in net/802/ ++obj-$(CONFIG_LLC) += llc/ ++obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ ++obj-$(CONFIG_INET) += ipv4/ ++obj-$(CONFIG_XFRM) += xfrm/ ++obj-$(CONFIG_UNIX) += unix/ ++ifneq ($(CONFIG_IPV6),) ++obj-y += ipv6/ ++endif ++obj-$(CONFIG_PACKET) += packet/ ++obj-$(CONFIG_NET_KEY) += key/ ++obj-$(CONFIG_NET_SCHED) += sched/ ++obj-$(CONFIG_BRIDGE) += bridge/ ++obj-$(CONFIG_IPX) += ipx/ ++obj-$(CONFIG_ATALK) += appletalk/ ++obj-$(CONFIG_WAN_ROUTER) += wanrouter/ ++obj-$(CONFIG_X25) += x25/ ++obj-$(CONFIG_LAPB) += lapb/ ++obj-$(CONFIG_NETROM) += netrom/ ++obj-$(CONFIG_ROSE) += rose/ ++obj-$(CONFIG_AX25) += ax25/ ++obj-$(CONFIG_IRDA) += irda/ ++obj-$(CONFIG_BT) += bluetooth/ ++obj-$(CONFIG_SUNRPC) += sunrpc/ ++obj-$(CONFIG_RXRPC) += rxrpc/ ++obj-$(CONFIG_ATM) += atm/ ++obj-$(CONFIG_DECNET) += decnet/ ++obj-$(CONFIG_ECONET) += econet/ ++obj-$(CONFIG_VLAN_8021Q) += 8021q/ ++obj-$(CONFIG_IP_SCTP) += sctp/ ++ ++ifeq ($(CONFIG_NET),y) ++obj-$(CONFIG_SYSCTL) += sysctl_net.o ++endif +diff --unified --recursive --new-file linux-2.6.12.5/net/core/dev.c linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c +--- linux-2.6.12.5/net/core/dev.c 2005-08-15 02:20:18.000000000 +0200 ++++ linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c 2005-10-22 23:50:45.479478250 +0200 +@@ -115,6 +115,56 @@ + #endif /* CONFIG_NET_RADIO */ + #include <asm/current.h> + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ ++/* #define RING_DEBUG */ ++ ++#include <linux/ring.h> ++#include <linux/version.h> ++ ++static handle_ring_skb ring_handler = NULL; ++ ++handle_ring_skb get_skb_ring_handler() { return(ring_handler); } ++ ++void set_skb_ring_handler(handle_ring_skb the_handler) { ++ ring_handler = the_handler; ++} ++ ++void do_skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, u_char real_skb) { ++ if(ring_handler) ++ ring_handler(skb, recv_packet, real_skb); ++} ++ ++/* ******************* */ ++ ++static handle_ring_buffer buffer_ring_handler = NULL; ++ ++handle_ring_buffer get_buffer_ring_handler() { return(buffer_ring_handler); } ++ ++void set_buffer_ring_handler(handle_ring_buffer the_handler) { ++ buffer_ring_handler = the_handler; ++} ++ ++int do_buffer_ring_handler(struct net_device *dev, char *data, int len) { ++ if(buffer_ring_handler) { ++ buffer_ring_handler(dev, data, len); ++ return(1); ++ } else ++ return(0); ++} ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++EXPORT_SYMBOL(get_skb_ring_handler); ++EXPORT_SYMBOL(set_skb_ring_handler); ++EXPORT_SYMBOL(do_skb_ring_handler); ++ ++EXPORT_SYMBOL(get_buffer_ring_handler); ++EXPORT_SYMBOL(set_buffer_ring_handler); ++EXPORT_SYMBOL(do_buffer_ring_handler); ++#endif ++ ++#endif + /* This define, if set, will randomly drop a packet when congestion + * is more than moderate. It helps fairness in the multi-interface + * case when one of them is a hog, but it kills performance for the +@@ -1293,6 +1343,10 @@ + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + #endif + if (q->enqueue) { ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler) ring_handler(skb, 0, 1); ++#endif /* CONFIG_RING */ ++ + /* Grab device queue */ + spin_lock(&dev->queue_lock); + +@@ -1509,6 +1563,13 @@ + + preempt_disable(); + err = netif_rx(skb); ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } ++#endif /* CONFIG_RING */ ++ + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); +@@ -1655,6 +1716,13 @@ + int ret = NET_RX_DROP; + unsigned short type; + ++#if defined (CONFIG_RING) || defined(CONFIG_RING_MODULE) ++ if(ring_handler && ring_handler(skb, 1, 1)) { ++ /* The packet has been copied into a ring */ ++ return(NET_RX_SUCCESS); ++ } ++#endif /* CONFIG_RING */ ++ + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) + return NET_RX_DROP; +diff --unified --recursive --new-file linux-2.6.12.5/net/core/dev.c.ORG linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c.ORG +--- linux-2.6.12.5/net/core/dev.c.ORG 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/core/dev.c.ORG 2005-10-22 23:50:45.203461000 +0200 +@@ -0,0 +1,3385 @@ ++/* ++ * NET3 Protocol independent device support routines. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Derived from the non IP parts of dev.c 1.0.19 ++ * Authors: Ross Biro ++ * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> ++ * Mark Evans, <evansmp@uhura.aston.ac.uk> ++ * ++ * Additional Authors: ++ * Florian la Roche <rzsfl@rz.uni-sb.de> ++ * Alan Cox <gw4pts@gw4pts.ampr.org> ++ * David Hinds <dahinds@users.sourceforge.net> ++ * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> ++ * Adam Sulmicki <adam@cfar.umd.edu> ++ * Pekka Riikonen <priikone@poesidon.pspt.fi> ++ * ++ * Changes: ++ * D.J. Barrow : Fixed bug where dev->refcnt gets set ++ * to 2 if register_netdev gets called ++ * before net_dev_init & also removed a ++ * few lines of code in the process. ++ * Alan Cox : device private ioctl copies fields back. ++ * Alan Cox : Transmit queue code does relevant ++ * stunts to keep the queue safe. ++ * Alan Cox : Fixed double lock. ++ * Alan Cox : Fixed promisc NULL pointer trap ++ * ???????? : Support the full private ioctl range ++ * Alan Cox : Moved ioctl permission check into ++ * drivers ++ * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI ++ * Alan Cox : 100 backlog just doesn't cut it when ++ * you start doing multicast video 8) ++ * Alan Cox : Rewrote net_bh and list manager. ++ * Alan Cox : Fix ETH_P_ALL echoback lengths. ++ * Alan Cox : Took out transmit every packet pass ++ * Saved a few bytes in the ioctl handler ++ * Alan Cox : Network driver sets packet type before ++ * calling netif_rx. Saves a function ++ * call a packet. ++ * Alan Cox : Hashed net_bh() ++ * Richard Kooijman: Timestamp fixes. ++ * Alan Cox : Wrong field in SIOCGIFDSTADDR ++ * Alan Cox : Device lock protection. ++ * Alan Cox : Fixed nasty side effect of device close ++ * changes. ++ * Rudi Cilibrasi : Pass the right thing to ++ * set_mac_address() ++ * Dave Miller : 32bit quantity for the device lock to ++ * make it work out on a Sparc. ++ * Bjorn Ekwall : Added KERNELD hack. ++ * Alan Cox : Cleaned up the backlog initialise. ++ * Craig Metz : SIOCGIFCONF fix if space for under ++ * 1 device. ++ * Thomas Bogendoerfer : Return ENODEV for dev_open, if there ++ * is no device open function. ++ * Andi Kleen : Fix error reporting for SIOCGIFCONF ++ * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF ++ * Cyrus Durgin : Cleaned for KMOD ++ * Adam Sulmicki : Bug Fix : Network Device Unload ++ * A network device unload needs to purge ++ * the backlog queue. ++ * Paul Rusty Russell : SIOCSIFNAME ++ * Pekka Riikonen : Netdev boot-time settings code ++ * Andrew Morton : Make unregister_netdevice wait ++ * indefinitely on dev->refcnt ++ * J Hadi Salim : - Backlog queue sampling ++ * - netif_rx() feedback ++ */ ++ ++#include <asm/uaccess.h> ++#include <asm/system.h> ++#include <linux/bitops.h> ++#include <linux/config.h> ++#include <linux/cpu.h> ++#include <linux/types.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/string.h> ++#include <linux/mm.h> ++#include <linux/socket.h> ++#include <linux/sockios.h> ++#include <linux/errno.h> ++#include <linux/interrupt.h> ++#include <linux/if_ether.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <linux/notifier.h> ++#include <linux/skbuff.h> ++#include <net/sock.h> ++#include <linux/rtnetlink.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <linux/stat.h> ++#include <linux/if_bridge.h> ++#include <linux/divert.h> ++#include <net/dst.h> ++#include <net/pkt_sched.h> ++#include <net/checksum.h> ++#include <linux/highmem.h> ++#include <linux/init.h> ++#include <linux/kmod.h> ++#include <linux/module.h> ++#include <linux/kallsyms.h> ++#include <linux/netpoll.h> ++#include <linux/rcupdate.h> ++#include <linux/delay.h> ++#ifdef CONFIG_NET_RADIO ++#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ ++#include <net/iw_handler.h> ++#endif /* CONFIG_NET_RADIO */ ++#include <asm/current.h> ++ ++/* This define, if set, will randomly drop a packet when congestion ++ * is more than moderate. It helps fairness in the multi-interface ++ * case when one of them is a hog, but it kills performance for the ++ * single interface case so it is off now by default. ++ */ ++#undef RAND_LIE ++ ++/* Setting this will sample the queue lengths and thus congestion ++ * via a timer instead of as each packet is received. ++ */ ++#undef OFFLINE_SAMPLE ++ ++/* ++ * The list of packet types we will receive (as opposed to discard) ++ * and the routines to invoke. ++ * ++ * Why 16. Because with 16 the only overlap we get on a hash of the ++ * low nibble of the protocol value is RARP/SNAP/X.25. ++ * ++ * NOTE: That is no longer true with the addition of VLAN tags. Not ++ * sure which should go first, but I bet it won't make much ++ * difference if we are running VLANs. The good news is that ++ * this protocol won't be in the list unless compiled in, so ++ * the average user (w/out VLANs) will not be adversly affected. ++ * --BLG ++ * ++ * 0800 IP ++ * 8100 802.1Q VLAN ++ * 0001 802.3 ++ * 0002 AX.25 ++ * 0004 802.2 ++ * 8035 RARP ++ * 0005 SNAP ++ * 0805 X.25 ++ * 0806 ARP ++ * 8137 IPX ++ * 0009 Localtalk ++ * 86DD IPv6 ++ */ ++ ++static DEFINE_SPINLOCK(ptype_lock); ++static struct list_head ptype_base[16]; /* 16 way hashed list */ ++static struct list_head ptype_all; /* Taps */ ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy); ++static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); ++#endif ++ ++/* ++ * The @dev_base list is protected by @dev_base_lock and the rtln ++ * semaphore. ++ * ++ * Pure readers hold dev_base_lock for reading. ++ * ++ * Writers must hold the rtnl semaphore while they loop through the ++ * dev_base list, and hold dev_base_lock for writing when they do the ++ * actual updates. This allows pure readers to access the list even ++ * while a writer is preparing to update it. ++ * ++ * To put it another way, dev_base_lock is held for writing only to ++ * protect against pure readers; the rtnl semaphore provides the ++ * protection against other writers. ++ * ++ * See, for example usages, register_netdevice() and ++ * unregister_netdevice(), which must be called with the rtnl ++ * semaphore held. ++ */ ++struct net_device *dev_base; ++static struct net_device **dev_tail = &dev_base; ++DEFINE_RWLOCK(dev_base_lock); ++ ++EXPORT_SYMBOL(dev_base); ++EXPORT_SYMBOL(dev_base_lock); ++ ++#define NETDEV_HASHBITS 8 ++static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; ++static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; ++ ++static inline struct hlist_head *dev_name_hash(const char *name) ++{ ++ unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); ++ return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(int ifindex) ++{ ++ return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; ++} ++ ++/* ++ * Our notifier list ++ */ ++ ++static struct notifier_block *netdev_chain; ++ ++/* ++ * Device drivers call our routines to queue packets here. We empty the ++ * queue in the local softnet handler. ++ */ ++DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; ++ ++#ifdef CONFIG_SYSFS ++extern int netdev_sysfs_init(void); ++extern int netdev_register_sysfs(struct net_device *); ++extern void netdev_unregister_sysfs(struct net_device *); ++#else ++#define netdev_sysfs_init() (0) ++#define netdev_register_sysfs(dev) (0) ++#define netdev_unregister_sysfs(dev) do { } while(0) ++#endif ++ ++ ++/******************************************************************************* ++ ++ Protocol management and registration routines ++ ++*******************************************************************************/ ++ ++/* ++ * For efficiency ++ */ ++ ++int netdev_nit; ++ ++/* ++ * Add a protocol ID to the list. Now that the input handler is ++ * smarter we can dispense with all the messy stuff that used to be ++ * here. ++ * ++ * BEWARE!!! Protocol handlers, mangling input packets, ++ * MUST BE last in hash buckets and checking protocol handlers ++ * MUST start from promiscuous ptype_all chain in net_bh. ++ * It is true now, do not change it. ++ * Explanation follows: if protocol handler, mangling packet, will ++ * be the first on list, it is not able to sense, that packet ++ * is cloned and should be copied-on-write, so that it will ++ * change it and subsequent readers will get broken packet. ++ * --ANK (980803) ++ */ ++ ++/** ++ * dev_add_pack - add packet handler ++ * @pt: packet type declaration ++ * ++ * Add a protocol handler to the networking stack. The passed &packet_type ++ * is linked into kernel lists and may not be freed until it has been ++ * removed from the kernel lists. ++ * ++ * This call does not sleep therefore it can not ++ * guarantee all CPU's that are in middle of receiving packets ++ * will see the new packet type (until the next received packet). ++ */ ++ ++void dev_add_pack(struct packet_type *pt) ++{ ++ int hash; ++ ++ spin_lock_bh(&ptype_lock); ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit++; ++ list_add_rcu(&pt->list, &ptype_all); ++ } else { ++ hash = ntohs(pt->type) & 15; ++ list_add_rcu(&pt->list, &ptype_base[hash]); ++ } ++ spin_unlock_bh(&ptype_lock); ++} ++ ++extern void linkwatch_run_queue(void); ++ ++ ++ ++/** ++ * __dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * The packet type might still be in use by receivers ++ * and must not be freed until after all the CPU's have gone ++ * through a quiescent state. ++ */ ++void __dev_remove_pack(struct packet_type *pt) ++{ ++ struct list_head *head; ++ struct packet_type *pt1; ++ ++ spin_lock_bh(&ptype_lock); ++ ++ if (pt->type == htons(ETH_P_ALL)) { ++ netdev_nit--; ++ head = &ptype_all; ++ } else ++ head = &ptype_base[ntohs(pt->type) & 15]; ++ ++ list_for_each_entry(pt1, head, list) { ++ if (pt == pt1) { ++ list_del_rcu(&pt->list); ++ goto out; ++ } ++ } ++ ++ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); ++out: ++ spin_unlock_bh(&ptype_lock); ++} ++/** ++ * dev_remove_pack - remove packet handler ++ * @pt: packet type declaration ++ * ++ * Remove a protocol handler that was previously added to the kernel ++ * protocol handlers by dev_add_pack(). The passed &packet_type is removed ++ * from the kernel lists and can be freed or reused once this function ++ * returns. ++ * ++ * This call sleeps to guarantee that no CPU is looking at the packet ++ * type after return. ++ */ ++void dev_remove_pack(struct packet_type *pt) ++{ ++ __dev_remove_pack(pt); ++ ++ synchronize_net(); ++} ++ ++/****************************************************************************** ++ ++ Device Boot-time Settings Routines ++ ++*******************************************************************************/ ++ ++/* Boot time configuration table */ ++static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; ++ ++/** ++ * netdev_boot_setup_add - add new setup entry ++ * @name: name of the device ++ * @map: configured settings for the device ++ * ++ * Adds new setup entry to the dev_boot_setup list. The function ++ * returns 0 on error and 1 on success. This is a generic routine to ++ * all netdevices. ++ */ ++static int netdev_boot_setup_add(char *name, struct ifmap *map) ++{ ++ struct netdev_boot_setup *s; ++ int i; ++ ++ s = dev_boot_setup; ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { ++ memset(s[i].name, 0, sizeof(s[i].name)); ++ strcpy(s[i].name, name); ++ memcpy(&s[i].map, map, sizeof(s[i].map)); ++ break; ++ } ++ } ++ ++ return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; ++} ++ ++/** ++ * netdev_boot_setup_check - check boot time settings ++ * @dev: the netdevice ++ * ++ * Check boot time settings for the device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found, 1 if they are. ++ */ ++int netdev_boot_setup_check(struct net_device *dev) ++{ ++ struct netdev_boot_setup *s = dev_boot_setup; ++ int i; ++ ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { ++ if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && ++ !strncmp(dev->name, s[i].name, strlen(s[i].name))) { ++ dev->irq = s[i].map.irq; ++ dev->base_addr = s[i].map.base_addr; ++ dev->mem_start = s[i].map.mem_start; ++ dev->mem_end = s[i].map.mem_end; ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++ ++/** ++ * netdev_boot_base - get address from boot time settings ++ * @prefix: prefix for network device ++ * @unit: id for network device ++ * ++ * Check boot time settings for the base address of device. ++ * The found settings are set for the device to be used ++ * later in the device probing. ++ * Returns 0 if no settings found. ++ */ ++unsigned long netdev_boot_base(const char *prefix, int unit) ++{ ++ const struct netdev_boot_setup *s = dev_boot_setup; ++ char name[IFNAMSIZ]; ++ int i; ++ ++ sprintf(name, "%s%d", prefix, unit); ++ ++ /* ++ * If device already registered then return base of 1 ++ * to indicate not to probe for this interface ++ */ ++ if (__dev_get_by_name(name)) ++ return 1; ++ ++ for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) ++ if (!strcmp(name, s[i].name)) ++ return s[i].map.base_addr; ++ return 0; ++} ++ ++/* ++ * Saves at boot time configured settings for any netdevice. ++ */ ++int __init netdev_boot_setup(char *str) ++{ ++ int ints[5]; ++ struct ifmap map; ++ ++ str = get_options(str, ARRAY_SIZE(ints), ints); ++ if (!str || !*str) ++ return 0; ++ ++ /* Save settings */ ++ memset(&map, 0, sizeof(map)); ++ if (ints[0] > 0) ++ map.irq = ints[1]; ++ if (ints[0] > 1) ++ map.base_addr = ints[2]; ++ if (ints[0] > 2) ++ map.mem_start = ints[3]; ++ if (ints[0] > 3) ++ map.mem_end = ints[4]; ++ ++ /* Add new entry to the list */ ++ return netdev_boot_setup_add(str, &map); ++} ++ ++__setup("netdev=", netdev_boot_setup); ++ ++/******************************************************************************* ++ ++ Device Interface Subroutines ++ ++*******************************************************************************/ ++ ++/** ++ * __dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * or @dev_base_lock. If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++struct net_device *__dev_get_by_name(const char *name) ++{ ++ struct hlist_node *p; ++ ++ hlist_for_each(p, dev_name_hash(name)) { ++ struct net_device *dev ++ = hlist_entry(p, struct net_device, name_hlist); ++ if (!strncmp(dev->name, name, IFNAMSIZ)) ++ return dev; ++ } ++ return NULL; ++} ++ ++/** ++ * dev_get_by_name - find a device by its name ++ * @name: name to find ++ * ++ * Find an interface by name. This can be called from any ++ * context and does its own locking. The returned handle has ++ * the usage count incremented and the caller must use dev_put() to ++ * release it when it is no longer needed. %NULL is returned if no ++ * matching device is found. ++ */ ++ ++struct net_device *dev_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * __dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns %NULL if the device ++ * is not found or a pointer to the device. The device has not ++ * had its reference counter increased so the caller must be careful ++ * about locking. The caller must hold either the RTNL semaphore ++ * or @dev_base_lock. ++ */ ++ ++struct net_device *__dev_get_by_index(int ifindex) ++{ ++ struct hlist_node *p; ++ ++ hlist_for_each(p, dev_index_hash(ifindex)) { ++ struct net_device *dev ++ = hlist_entry(p, struct net_device, index_hlist); ++ if (dev->ifindex == ifindex) ++ return dev; ++ } ++ return NULL; ++} ++ ++ ++/** ++ * dev_get_by_index - find a device by its ifindex ++ * @ifindex: index of device ++ * ++ * Search for an interface by index. Returns NULL if the device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. ++ */ ++ ++struct net_device *dev_get_by_index(int ifindex) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifindex); ++ if (dev) ++ dev_hold(dev); ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_getbyhwaddr - find a device by its hardware address ++ * @type: media type of device ++ * @ha: hardware address ++ * ++ * Search for an interface by MAC address. Returns NULL if the device ++ * is not found or a pointer to the device. The caller must hold the ++ * rtnl semaphore. The returned device has not had its ref count increased ++ * and the caller must therefore be careful about locking ++ * ++ * BUGS: ++ * If the API was consistent this would be __dev_get_by_hwaddr ++ */ ++ ++struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) ++{ ++ struct net_device *dev; ++ ++ ASSERT_RTNL(); ++ ++ for (dev = dev_base; dev; dev = dev->next) ++ if (dev->type == type && ++ !memcmp(dev->dev_addr, ha, dev->addr_len)) ++ break; ++ return dev; ++} ++ ++struct net_device *dev_getfirstbyhwtype(unsigned short type) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ for (dev = dev_base; dev; dev = dev->next) { ++ if (dev->type == type) { ++ dev_hold(dev); ++ break; ++ } ++ } ++ rtnl_unlock(); ++ return dev; ++} ++ ++EXPORT_SYMBOL(dev_getfirstbyhwtype); ++ ++/** ++ * dev_get_by_flags - find any device with given flags ++ * @if_flags: IFF_* values ++ * @mask: bitmask of bits in if_flags to check ++ * ++ * Search for any interface with the given flags. Returns NULL if a device ++ * is not found or a pointer to the device. The device returned has ++ * had a reference added and the pointer is safe until the user calls ++ * dev_put to indicate they have finished with it. ++ */ ++ ++struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ for (dev = dev_base; dev != NULL; dev = dev->next) { ++ if (((dev->flags ^ if_flags) & mask) == 0) { ++ dev_hold(dev); ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ return dev; ++} ++ ++/** ++ * dev_valid_name - check if name is okay for network device ++ * @name: name string ++ * ++ * Network device names need to be valid file names to ++ * to allow sysfs to work ++ */ ++static int dev_valid_name(const char *name) ++{ ++ return !(*name == '\0' ++ || !strcmp(name, ".") ++ || !strcmp(name, "..") ++ || strchr(name, '/')); ++} ++ ++/** ++ * dev_alloc_name - allocate a name for a device ++ * @dev: device ++ * @name: name format string ++ * ++ * Passed a format string - eg "lt%d" it will try and find a suitable ++ * id. Not efficient for many devices, not called a lot. The caller ++ * must hold the dev_base or rtnl lock while allocating the name and ++ * adding the device in order to avoid duplicates. Returns the number ++ * of the unit assigned or a negative errno code. ++ */ ++ ++int dev_alloc_name(struct net_device *dev, const char *name) ++{ ++ int i = 0; ++ char buf[IFNAMSIZ]; ++ const char *p; ++ const int max_netdevices = 8*PAGE_SIZE; ++ long *inuse; ++ struct net_device *d; ++ ++ p = strnchr(name, IFNAMSIZ-1, '%'); ++ if (p) { ++ /* ++ * Verify the string as this thing may have come from ++ * the user. There must be either one "%d" and no other "%" ++ * characters. ++ */ ++ if (p[1] != 'd' || strchr(p + 2, '%')) ++ return -EINVAL; ++ ++ /* Use one page as a bit array of possible slots */ ++ inuse = (long *) get_zeroed_page(GFP_ATOMIC); ++ if (!inuse) ++ return -ENOMEM; ++ ++ for (d = dev_base; d; d = d->next) { ++ if (!sscanf(d->name, name, &i)) ++ continue; ++ if (i < 0 || i >= max_netdevices) ++ continue; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!strncmp(buf, d->name, IFNAMSIZ)) ++ set_bit(i, inuse); ++ } ++ ++ i = find_first_zero_bit(inuse, max_netdevices); ++ free_page((unsigned long) inuse); ++ } ++ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!__dev_get_by_name(buf)) { ++ strlcpy(dev->name, buf, IFNAMSIZ); ++ return i; ++ } ++ ++ /* It is possible to run out of possible slots ++ * when the name is long and there isn't enough space left ++ * for the digits, or if all bits are used. ++ */ ++ return -ENFILE; ++} ++ ++ ++/** ++ * dev_change_name - change name of a device ++ * @dev: device ++ * @newname: name (or format string) must be at least IFNAMSIZ ++ * ++ * Change name of a device, can pass format strings "eth%d". ++ * for wildcarding. ++ */ ++int dev_change_name(struct net_device *dev, char *newname) ++{ ++ int err = 0; ++ ++ ASSERT_RTNL(); ++ ++ if (dev->flags & IFF_UP) ++ return -EBUSY; ++ ++ if (!dev_valid_name(newname)) ++ return -EINVAL; ++ ++ if (strchr(newname, '%')) { ++ err = dev_alloc_name(dev, newname); ++ if (err < 0) ++ return err; ++ strcpy(newname, dev->name); ++ } ++ else if (__dev_get_by_name(newname)) ++ return -EEXIST; ++ else ++ strlcpy(dev->name, newname, IFNAMSIZ); ++ ++ err = class_device_rename(&dev->class_dev, dev->name); ++ if (!err) { ++ hlist_del(&dev->name_hlist); ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); ++ } ++ ++ return err; ++} ++ ++/** ++ * netdev_features_change - device changes fatures ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed features. ++ */ ++void netdev_features_change(struct net_device *dev) ++{ ++ notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev); ++} ++EXPORT_SYMBOL(netdev_features_change); ++ ++/** ++ * netdev_state_change - device changes state ++ * @dev: device to cause notification ++ * ++ * Called to indicate a device has changed state. This function calls ++ * the notifier chains for netdev_chain and sends a NEWLINK message ++ * to the routing socket. ++ */ ++void netdev_state_change(struct net_device *dev) ++{ ++ if (dev->flags & IFF_UP) { ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ rtmsg_ifinfo(RTM_NEWLINK, dev, 0); ++ } ++} ++ ++/** ++ * dev_load - load a network module ++ * @name: name of interface ++ * ++ * If a network interface is not present and the process has suitable ++ * privileges this function loads the module. If module loading is not ++ * available in this kernel then it becomes a nop. ++ */ ++ ++void dev_load(const char *name) ++{ ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(name); ++ read_unlock(&dev_base_lock); ++ ++ if (!dev && capable(CAP_SYS_MODULE)) ++ request_module("%s", name); ++} ++ ++static int default_rebuild_header(struct sk_buff *skb) ++{ ++ printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", ++ skb->dev ? skb->dev->name : "NULL!!!"); ++ kfree_skb(skb); ++ return 1; ++} ++ ++ ++/** ++ * dev_open - prepare an interface for use. ++ * @dev: device to open ++ * ++ * Takes a device from down to up state. The device's private open ++ * function is invoked and then the multicast lists are loaded. Finally ++ * the device is moved into the up state and a %NETDEV_UP message is ++ * sent to the netdev notifier chain. ++ * ++ * Calling this function on an active interface is a nop. On a failure ++ * a negative errno code is returned. ++ */ ++int dev_open(struct net_device *dev) ++{ ++ int ret = 0; ++ ++ /* ++ * Is it already up? ++ */ ++ ++ if (dev->flags & IFF_UP) ++ return 0; ++ ++ /* ++ * Is it even present? ++ */ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ /* ++ * Call device private open method ++ */ ++ set_bit(__LINK_STATE_START, &dev->state); ++ if (dev->open) { ++ ret = dev->open(dev); ++ if (ret) ++ clear_bit(__LINK_STATE_START, &dev->state); ++ } ++ ++ /* ++ * If it went open OK then: ++ */ ++ ++ if (!ret) { ++ /* ++ * Set the flags. ++ */ ++ dev->flags |= IFF_UP; ++ ++ /* ++ * Initialize multicasting status ++ */ ++ dev_mc_upload(dev); ++ ++ /* ++ * Wakeup transmit queue engine ++ */ ++ dev_activate(dev); ++ ++ /* ++ * ... and announce new interface. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UP, dev); ++ } ++ return ret; ++} ++ ++/** ++ * dev_close - shutdown an interface. ++ * @dev: device to shutdown ++ * ++ * This function moves an active device into down state. A ++ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device ++ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier ++ * chain. ++ */ ++int dev_close(struct net_device *dev) ++{ ++ if (!(dev->flags & IFF_UP)) ++ return 0; ++ ++ /* ++ * Tell people we are going down, so that they can ++ * prepare to death, when device is still operating. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); ++ ++ dev_deactivate(dev); ++ ++ clear_bit(__LINK_STATE_START, &dev->state); ++ ++ /* Synchronize to scheduled poll. We cannot touch poll list, ++ * it can be even on different cpu. So just clear netif_running(), ++ * and wait when poll really will happen. Actually, the best place ++ * for this is inside dev->stop() after device stopped its irq ++ * engine, but this requires more changes in devices. */ ++ ++ smp_mb__after_clear_bit(); /* Commit netif_running(). */ ++ while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { ++ /* No hurry. */ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(1); ++ } ++ ++ /* ++ * Call the device specific close. This cannot fail. ++ * Only if device is UP ++ * ++ * We allow it to be called even after a DETACH hot-plug ++ * event. ++ */ ++ if (dev->stop) ++ dev->stop(dev); ++ ++ /* ++ * Device is now down. ++ */ ++ ++ dev->flags &= ~IFF_UP; ++ ++ /* ++ * Tell people we are down ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); ++ ++ return 0; ++} ++ ++ ++/* ++ * Device change register/unregister. These are not inline or static ++ * as we export them to the world. ++ */ ++ ++/** ++ * register_netdevice_notifier - register a network notifier block ++ * @nb: notifier ++ * ++ * Register a notifier to be called when network device events occur. ++ * The notifier passed is linked into the kernel structures and must ++ * not be reused until it has been unregistered. A negative errno code ++ * is returned on a failure. ++ * ++ * When registered all registration and up events are replayed ++ * to the new notifier to allow device to have a race free ++ * view of the network device list. ++ */ ++ ++int register_netdevice_notifier(struct notifier_block *nb) ++{ ++ struct net_device *dev; ++ int err; ++ ++ rtnl_lock(); ++ err = notifier_chain_register(&netdev_chain, nb); ++ if (!err) { ++ for (dev = dev_base; dev; dev = dev->next) { ++ nb->notifier_call(nb, NETDEV_REGISTER, dev); ++ ++ if (dev->flags & IFF_UP) ++ nb->notifier_call(nb, NETDEV_UP, dev); ++ } ++ } ++ rtnl_unlock(); ++ return err; ++} ++ ++/** ++ * unregister_netdevice_notifier - unregister a network notifier block ++ * @nb: notifier ++ * ++ * Unregister a notifier previously registered by ++ * register_netdevice_notifier(). The notifier is unlinked into the ++ * kernel structures and may then be reused. A negative errno code ++ * is returned on a failure. ++ */ ++ ++int unregister_netdevice_notifier(struct notifier_block *nb) ++{ ++ return notifier_chain_unregister(&netdev_chain, nb); ++} ++ ++/** ++ * call_netdevice_notifiers - call all network notifier blocks ++ * @val: value passed unmodified to notifier function ++ * @v: pointer passed unmodified to notifier function ++ * ++ * Call all network notifier blocks. Parameters and return value ++ * are as for notifier_call_chain(). ++ */ ++ ++int call_netdevice_notifiers(unsigned long val, void *v) ++{ ++ return notifier_call_chain(&netdev_chain, val, v); ++} ++ ++/* When > 0 there are consumers of rx skb time stamps */ ++static atomic_t netstamp_needed = ATOMIC_INIT(0); ++ ++void net_enable_timestamp(void) ++{ ++ atomic_inc(&netstamp_needed); ++} ++ ++void net_disable_timestamp(void) ++{ ++ atomic_dec(&netstamp_needed); ++} ++ ++static inline void net_timestamp(struct timeval *stamp) ++{ ++ if (atomic_read(&netstamp_needed)) ++ do_gettimeofday(stamp); ++ else { ++ stamp->tv_sec = 0; ++ stamp->tv_usec = 0; ++ } ++} ++ ++/* ++ * Support routine. Sends outgoing frames to any network ++ * taps currently in use. ++ */ ++ ++void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct packet_type *ptype; ++ net_timestamp(&skb->stamp); ++ ++ rcu_read_lock(); ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ /* Never send packets back to the socket ++ * they originated from - MvS (miquels@drinkel.ow.org) ++ */ ++ if ((ptype->dev == dev || !ptype->dev) && ++ (ptype->af_packet_priv == NULL || ++ (struct sock *)ptype->af_packet_priv != skb->sk)) { ++ struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); ++ if (!skb2) ++ break; ++ ++ /* skb->nh should be correctly ++ set by sender, so that the second statement is ++ just protection against buggy protocols. ++ */ ++ skb2->mac.raw = skb2->data; ++ ++ if (skb2->nh.raw < skb2->data || ++ skb2->nh.raw > skb2->tail) { ++ if (net_ratelimit()) ++ printk(KERN_CRIT "protocol %04x is " ++ "buggy, dev %s\n", ++ skb2->protocol, dev->name); ++ skb2->nh.raw = skb2->data; ++ } ++ ++ skb2->h.raw = skb2->nh.raw; ++ skb2->pkt_type = PACKET_OUTGOING; ++ ptype->func(skb2, skb->dev, ptype); ++ } ++ } ++ rcu_read_unlock(); ++} ++ ++/* ++ * Invalidate hardware checksum when packet is to be mangled, and ++ * complete checksum manually on outgoing path. ++ */ ++int skb_checksum_help(struct sk_buff *skb, int inward) ++{ ++ unsigned int csum; ++ int ret = 0, offset = skb->h.raw - skb->data; ++ ++ if (inward) { ++ skb->ip_summed = CHECKSUM_NONE; ++ goto out; ++ } ++ ++ if (skb_cloned(skb)) { ++ ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); ++ if (ret) ++ goto out; ++ } ++ ++ if (offset > (int)skb->len) ++ BUG(); ++ csum = skb_checksum(skb, offset, skb->len-offset, 0); ++ ++ offset = skb->tail - skb->h.raw; ++ if (offset <= 0) ++ BUG(); ++ if (skb->csum + 2 > offset) ++ BUG(); ++ ++ *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); ++ skb->ip_summed = CHECKSUM_NONE; ++out: ++ return ret; ++} ++ ++#ifdef CONFIG_HIGHMEM ++/* Actually, we should eliminate this check as soon as we know, that: ++ * 1. IOMMU is present and allows to map all the memory. ++ * 2. No high memory really exists on this machine. ++ */ ++ ++static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) ++{ ++ int i; ++ ++ if (dev->features & NETIF_F_HIGHDMA) ++ return 0; ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) ++ if (PageHighMem(skb_shinfo(skb)->frags[i].page)) ++ return 1; ++ ++ return 0; ++} ++#else ++#define illegal_highdma(dev, skb) (0) ++#endif ++ ++extern void skb_release_data(struct sk_buff *); ++ ++/* Keep head the same: replace data */ ++int __skb_linearize(struct sk_buff *skb, int gfp_mask) ++{ ++ unsigned int size; ++ u8 *data; ++ long offset; ++ struct skb_shared_info *ninfo; ++ int headerlen = skb->data - skb->head; ++ int expand = (skb->tail + skb->data_len) - skb->end; ++ ++ if (skb_shared(skb)) ++ BUG(); ++ ++ if (expand <= 0) ++ expand = 0; ++ ++ size = skb->end - skb->head + expand; ++ size = SKB_DATA_ALIGN(size); ++ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); ++ if (!data) ++ return -ENOMEM; ++ ++ /* Copy entire thing */ ++ if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) ++ BUG(); ++ ++ /* Set up shinfo */ ++ ninfo = (struct skb_shared_info*)(data + size); ++ atomic_set(&ninfo->dataref, 1); ++ ninfo->tso_size = skb_shinfo(skb)->tso_size; ++ ninfo->tso_segs = skb_shinfo(skb)->tso_segs; ++ ninfo->nr_frags = 0; ++ ninfo->frag_list = NULL; ++ ++ /* Offset between the two in bytes */ ++ offset = data - skb->head; ++ ++ /* Free old data. */ ++ skb_release_data(skb); ++ ++ skb->head = data; ++ skb->end = data + size; ++ ++ /* Set up new pointers */ ++ skb->h.raw += offset; ++ skb->nh.raw += offset; ++ skb->mac.raw += offset; ++ skb->tail += offset; ++ skb->data += offset; ++ ++ /* We are no longer a clone, even if we were. */ ++ skb->cloned = 0; ++ ++ skb->tail += skb->data_len; ++ skb->data_len = 0; ++ return 0; ++} ++ ++#define HARD_TX_LOCK(dev, cpu) { \ ++ if ((dev->features & NETIF_F_LLTX) == 0) { \ ++ spin_lock(&dev->xmit_lock); \ ++ dev->xmit_lock_owner = cpu; \ ++ } \ ++} ++ ++#define HARD_TX_UNLOCK(dev) { \ ++ if ((dev->features & NETIF_F_LLTX) == 0) { \ ++ dev->xmit_lock_owner = -1; \ ++ spin_unlock(&dev->xmit_lock); \ ++ } \ ++} ++ ++/** ++ * dev_queue_xmit - transmit a buffer ++ * @skb: buffer to transmit ++ * ++ * Queue a buffer for transmission to a network device. The caller must ++ * have set the device and priority and built the buffer before calling ++ * this function. The function can be called from an interrupt. ++ * ++ * A negative errno code is returned on a failure. A success does not ++ * guarantee the frame will be transmitted as it may be dropped due ++ * to congestion or traffic shaping. ++ * ++ * ----------------------------------------------------------------------------------- ++ * I notice this method can also return errors from the queue disciplines, ++ * including NET_XMIT_DROP, which is a positive value. So, errors can also ++ * be positive. ++ * ++ * Regardless of the return value, the skb is consumed, so it is currently ++ * difficult to retry a send to this method. (You can bump the ref count ++ * before sending to hold a reference for retry if you are careful.) ++ * ++ * When calling this method, interrupts MUST be enabled. This is because ++ * the BH enable code must have IRQs enabled so that it will not deadlock. ++ * --BLG ++ */ ++ ++int dev_queue_xmit(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ struct Qdisc *q; ++ int rc = -ENOMEM; ++ ++ if (skb_shinfo(skb)->frag_list && ++ !(dev->features & NETIF_F_FRAGLIST) && ++ __skb_linearize(skb, GFP_ATOMIC)) ++ goto out_kfree_skb; ++ ++ /* Fragmented skb is linearized if device does not support SG, ++ * or if at least one of fragments is in highmem and device ++ * does not support DMA from it. ++ */ ++ if (skb_shinfo(skb)->nr_frags && ++ (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && ++ __skb_linearize(skb, GFP_ATOMIC)) ++ goto out_kfree_skb; ++ ++ /* If packet is not checksummed and device does not support ++ * checksumming for this protocol, complete checksumming here. ++ */ ++ if (skb->ip_summed == CHECKSUM_HW && ++ (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && ++ (!(dev->features & NETIF_F_IP_CSUM) || ++ skb->protocol != htons(ETH_P_IP)))) ++ if (skb_checksum_help(skb, 0)) ++ goto out_kfree_skb; ++ ++ /* Disable soft irqs for various locks below. Also ++ * stops preemption for RCU. ++ */ ++ local_bh_disable(); ++ ++ /* Updates of qdisc are serialized by queue_lock. ++ * The struct Qdisc which is pointed to by qdisc is now a ++ * rcu structure - it may be accessed without acquiring ++ * a lock (but the structure may be stale.) The freeing of the ++ * qdisc will be deferred until it's known that there are no ++ * more references to it. ++ * ++ * If the qdisc has an enqueue function, we still need to ++ * hold the queue_lock before calling it, since queue_lock ++ * also serializes access to the device queue. ++ */ ++ ++ q = rcu_dereference(dev->qdisc); ++#ifdef CONFIG_NET_CLS_ACT ++ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); ++#endif ++ if (q->enqueue) { ++ /* Grab device queue */ ++ spin_lock(&dev->queue_lock); ++ ++ rc = q->enqueue(skb, q); ++ ++ qdisc_run(dev); ++ ++ spin_unlock(&dev->queue_lock); ++ rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; ++ goto out; ++ } ++ ++ /* The device has no queue. Common case for software devices: ++ loopback, all the sorts of tunnels... ++ ++ Really, it is unlikely that xmit_lock protection is necessary here. ++ (f.e. loopback and IP tunnels are clean ignoring statistics ++ counters.) ++ However, it is possible, that they rely on protection ++ made by us here. ++ ++ Check this and shot the lock. It is not prone from deadlocks. ++ Either shot noqueue qdisc, it is even simpler 8) ++ */ ++ if (dev->flags & IFF_UP) { ++ int cpu = smp_processor_id(); /* ok because BHs are off */ ++ ++ if (dev->xmit_lock_owner != cpu) { ++ ++ HARD_TX_LOCK(dev, cpu); ++ ++ if (!netif_queue_stopped(dev)) { ++ if (netdev_nit) ++ dev_queue_xmit_nit(skb, dev); ++ ++ rc = 0; ++ if (!dev->hard_start_xmit(skb, dev)) { ++ HARD_TX_UNLOCK(dev); ++ goto out; ++ } ++ } ++ HARD_TX_UNLOCK(dev); ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Virtual device %s asks to " ++ "queue packet!\n", dev->name); ++ } else { ++ /* Recursion is detected! It is possible, ++ * unfortunately */ ++ if (net_ratelimit()) ++ printk(KERN_CRIT "Dead loop on virtual device " ++ "%s, fix it urgently!\n", dev->name); ++ } ++ } ++ ++ rc = -ENETDOWN; ++ local_bh_enable(); ++ ++out_kfree_skb: ++ kfree_skb(skb); ++ return rc; ++out: ++ local_bh_enable(); ++ return rc; ++} ++ ++ ++/*======================================================================= ++ Receiver routines ++ =======================================================================*/ ++ ++int netdev_max_backlog = 300; ++int weight_p = 64; /* old backlog weight */ ++/* These numbers are selected based on intuition and some ++ * experimentatiom, if you have more scientific way of doing this ++ * please go ahead and fix things. ++ */ ++int no_cong_thresh = 10; ++int no_cong = 20; ++int lo_cong = 100; ++int mod_cong = 290; ++ ++DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; ++ ++ ++static void get_sample_stats(int cpu) ++{ ++#ifdef RAND_LIE ++ unsigned long rd; ++ int rq; ++#endif ++ struct softnet_data *sd = &per_cpu(softnet_data, cpu); ++ int blog = sd->input_pkt_queue.qlen; ++ int avg_blog = sd->avg_blog; ++ ++ avg_blog = (avg_blog >> 1) + (blog >> 1); ++ ++ if (avg_blog > mod_cong) { ++ /* Above moderate congestion levels. */ ++ sd->cng_level = NET_RX_CN_HIGH; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ sd->cng_level = NET_RX_DROP; ++#endif ++ } else if (avg_blog > lo_cong) { ++ sd->cng_level = NET_RX_CN_MOD; ++#ifdef RAND_LIE ++ rd = net_random(); ++ rq = rd % netdev_max_backlog; ++ if (rq < avg_blog) /* unlucky bastard */ ++ sd->cng_level = NET_RX_CN_HIGH; ++#endif ++ } else if (avg_blog > no_cong) ++ sd->cng_level = NET_RX_CN_LOW; ++ else /* no congestion */ ++ sd->cng_level = NET_RX_SUCCESS; ++ ++ sd->avg_blog = avg_blog; ++} ++ ++#ifdef OFFLINE_SAMPLE ++static void sample_queue(unsigned long dummy) ++{ ++/* 10 ms 0r 1ms -- i don't care -- JHS */ ++ int next_tick = 1; ++ int cpu = smp_processor_id(); ++ ++ get_sample_stats(cpu); ++ next_tick += jiffies; ++ mod_timer(&samp_timer, next_tick); ++} ++#endif ++ ++ ++/** ++ * netif_rx - post buffer to the network code ++ * @skb: buffer to post ++ * ++ * This function receives a packet from a device driver and queues it for ++ * the upper (protocol) levels to process. It always succeeds. The buffer ++ * may be dropped during processing for congestion control or by the ++ * protocol layers. ++ * ++ * return values: ++ * NET_RX_SUCCESS (no congestion) ++ * NET_RX_CN_LOW (low congestion) ++ * NET_RX_CN_MOD (moderate congestion) ++ * NET_RX_CN_HIGH (high congestion) ++ * NET_RX_DROP (packet was dropped) ++ * ++ */ ++ ++int netif_rx(struct sk_buff *skb) ++{ ++ int this_cpu; ++ struct softnet_data *queue; ++ unsigned long flags; ++ ++ /* if netpoll wants it, pretend we never saw it */ ++ if (netpoll_rx(skb)) ++ return NET_RX_DROP; ++ ++ if (!skb->stamp.tv_sec) ++ net_timestamp(&skb->stamp); ++ ++ /* ++ * The code is rearranged so that the path is the most ++ * short when CPU is congested, but is still operating. ++ */ ++ local_irq_save(flags); ++ this_cpu = smp_processor_id(); ++ queue = &__get_cpu_var(softnet_data); ++ ++ __get_cpu_var(netdev_rx_stat).total++; ++ if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { ++ if (queue->input_pkt_queue.qlen) { ++ if (queue->throttle) ++ goto drop; ++ ++enqueue: ++ dev_hold(skb->dev); ++ __skb_queue_tail(&queue->input_pkt_queue, skb); ++#ifndef OFFLINE_SAMPLE ++ get_sample_stats(this_cpu); ++#endif ++ local_irq_restore(flags); ++ return queue->cng_level; ++ } ++ ++ if (queue->throttle) ++ queue->throttle = 0; ++ ++ netif_rx_schedule(&queue->backlog_dev); ++ goto enqueue; ++ } ++ ++ if (!queue->throttle) { ++ queue->throttle = 1; ++ __get_cpu_var(netdev_rx_stat).throttled++; ++ } ++ ++drop: ++ __get_cpu_var(netdev_rx_stat).dropped++; ++ local_irq_restore(flags); ++ ++ kfree_skb(skb); ++ return NET_RX_DROP; ++} ++ ++int netif_rx_ni(struct sk_buff *skb) ++{ ++ int err; ++ ++ preempt_disable(); ++ err = netif_rx(skb); ++ if (local_softirq_pending()) ++ do_softirq(); ++ preempt_enable(); ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(netif_rx_ni); ++ ++static __inline__ void skb_bond(struct sk_buff *skb) ++{ ++ struct net_device *dev = skb->dev; ++ ++ if (dev->master) { ++ skb->real_dev = skb->dev; ++ skb->dev = dev->master; ++ } ++} ++ ++static void net_tx_action(struct softirq_action *h) ++{ ++ struct softnet_data *sd = &__get_cpu_var(softnet_data); ++ ++ if (sd->completion_queue) { ++ struct sk_buff *clist; ++ ++ local_irq_disable(); ++ clist = sd->completion_queue; ++ sd->completion_queue = NULL; ++ local_irq_enable(); ++ ++ while (clist) { ++ struct sk_buff *skb = clist; ++ clist = clist->next; ++ ++ BUG_TRAP(!atomic_read(&skb->users)); ++ __kfree_skb(skb); ++ } ++ } ++ ++ if (sd->output_queue) { ++ struct net_device *head; ++ ++ local_irq_disable(); ++ head = sd->output_queue; ++ sd->output_queue = NULL; ++ local_irq_enable(); ++ ++ while (head) { ++ struct net_device *dev = head; ++ head = head->next_sched; ++ ++ smp_mb__before_clear_bit(); ++ clear_bit(__LINK_STATE_SCHED, &dev->state); ++ ++ if (spin_trylock(&dev->queue_lock)) { ++ qdisc_run(dev); ++ spin_unlock(&dev->queue_lock); ++ } else { ++ netif_schedule(dev); ++ } ++ } ++ } ++} ++ ++static __inline__ int deliver_skb(struct sk_buff *skb, ++ struct packet_type *pt_prev) ++{ ++ atomic_inc(&skb->users); ++ return pt_prev->func(skb, skb->dev, pt_prev); ++} ++ ++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) ++int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); ++struct net_bridge; ++struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, ++ unsigned char *addr); ++void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); ++ ++static __inline__ int handle_bridge(struct sk_buff **pskb, ++ struct packet_type **pt_prev, int *ret) ++{ ++ struct net_bridge_port *port; ++ ++ if ((*pskb)->pkt_type == PACKET_LOOPBACK || ++ (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) ++ return 0; ++ ++ if (*pt_prev) { ++ *ret = deliver_skb(*pskb, *pt_prev); ++ *pt_prev = NULL; ++ } ++ ++ return br_handle_frame_hook(port, pskb); ++} ++#else ++#define handle_bridge(skb, pt_prev, ret) (0) ++#endif ++ ++#ifdef CONFIG_NET_CLS_ACT ++/* TODO: Maybe we should just force sch_ingress to be compiled in ++ * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions ++ * a compare and 2 stores extra right now if we dont have it on ++ * but have CONFIG_NET_CLS_ACT ++ * NOTE: This doesnt stop any functionality; if you dont have ++ * the ingress scheduler, you just cant add policies on ingress. ++ * ++ */ ++static int ing_filter(struct sk_buff *skb) ++{ ++ struct Qdisc *q; ++ struct net_device *dev = skb->dev; ++ int result = TC_ACT_OK; ++ ++ if (dev->qdisc_ingress) { ++ __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); ++ if (MAX_RED_LOOP < ttl++) { ++ printk("Redir loop detected Dropping packet (%s->%s)\n", ++ skb->input_dev?skb->input_dev->name:"??",skb->dev->name); ++ return TC_ACT_SHOT; ++ } ++ ++ skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); ++ ++ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); ++ if (NULL == skb->input_dev) { ++ skb->input_dev = skb->dev; ++ printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); ++ } ++ spin_lock(&dev->ingress_lock); ++ if ((q = dev->qdisc_ingress) != NULL) ++ result = q->enqueue(skb, q); ++ spin_unlock(&dev->ingress_lock); ++ ++ } ++ ++ return result; ++} ++#endif ++ ++int netif_receive_skb(struct sk_buff *skb) ++{ ++ struct packet_type *ptype, *pt_prev; ++ int ret = NET_RX_DROP; ++ unsigned short type; ++ ++ /* if we've gotten here through NAPI, check netpoll */ ++ if (skb->dev->poll && netpoll_rx(skb)) ++ return NET_RX_DROP; ++ ++ if (!skb->stamp.tv_sec) ++ net_timestamp(&skb->stamp); ++ ++ skb_bond(skb); ++ ++ __get_cpu_var(netdev_rx_stat).total++; ++ ++ skb->h.raw = skb->nh.raw = skb->data; ++ skb->mac_len = skb->nh.raw - skb->mac.raw; ++ ++ pt_prev = NULL; ++ ++ rcu_read_lock(); ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (skb->tc_verd & TC_NCLS) { ++ skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); ++ goto ncls; ++ } ++#endif ++ ++ list_for_each_entry_rcu(ptype, &ptype_all, list) { ++ if (!ptype->dev || ptype->dev == skb->dev) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = ptype; ++ } ++ } ++ ++#ifdef CONFIG_NET_CLS_ACT ++ if (pt_prev) { ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = NULL; /* noone else should process this after*/ ++ } else { ++ skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); ++ } ++ ++ ret = ing_filter(skb); ++ ++ if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { ++ kfree_skb(skb); ++ goto out; ++ } ++ ++ skb->tc_verd = 0; ++ncls: ++#endif ++ ++ handle_diverter(skb); ++ ++ if (handle_bridge(&skb, &pt_prev, &ret)) ++ goto out; ++ ++ type = skb->protocol; ++ list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { ++ if (ptype->type == type && ++ (!ptype->dev || ptype->dev == skb->dev)) { ++ if (pt_prev) ++ ret = deliver_skb(skb, pt_prev); ++ pt_prev = ptype; ++ } ++ } ++ ++ if (pt_prev) { ++ ret = pt_prev->func(skb, skb->dev, pt_prev); ++ } else { ++ kfree_skb(skb); ++ /* Jamal, now you will not able to escape explaining ++ * me how you were going to use this. :-) ++ */ ++ ret = NET_RX_DROP; ++ } ++ ++out: ++ rcu_read_unlock(); ++ return ret; ++} ++ ++static int process_backlog(struct net_device *backlog_dev, int *budget) ++{ ++ int work = 0; ++ int quota = min(backlog_dev->quota, *budget); ++ struct softnet_data *queue = &__get_cpu_var(softnet_data); ++ unsigned long start_time = jiffies; ++ ++ backlog_dev->weight = weight_p; ++ for (;;) { ++ struct sk_buff *skb; ++ struct net_device *dev; ++ ++ local_irq_disable(); ++ skb = __skb_dequeue(&queue->input_pkt_queue); ++ if (!skb) ++ goto job_done; ++ local_irq_enable(); ++ ++ dev = skb->dev; ++ ++ netif_receive_skb(skb); ++ ++ dev_put(dev); ++ ++ work++; ++ ++ if (work >= quota || jiffies - start_time > 1) ++ break; ++ ++ } ++ ++ backlog_dev->quota -= work; ++ *budget -= work; ++ return -1; ++ ++job_done: ++ backlog_dev->quota -= work; ++ *budget -= work; ++ ++ list_del(&backlog_dev->poll_list); ++ smp_mb__before_clear_bit(); ++ netif_poll_enable(backlog_dev); ++ ++ if (queue->throttle) ++ queue->throttle = 0; ++ local_irq_enable(); ++ return 0; ++} ++ ++static void net_rx_action(struct softirq_action *h) ++{ ++ struct softnet_data *queue = &__get_cpu_var(softnet_data); ++ unsigned long start_time = jiffies; ++ int budget = netdev_max_backlog; ++ ++ ++ local_irq_disable(); ++ ++ while (!list_empty(&queue->poll_list)) { ++ struct net_device *dev; ++ ++ if (budget <= 0 || jiffies - start_time > 1) ++ goto softnet_break; ++ ++ local_irq_enable(); ++ ++ dev = list_entry(queue->poll_list.next, ++ struct net_device, poll_list); ++ netpoll_poll_lock(dev); ++ ++ if (dev->quota <= 0 || dev->poll(dev, &budget)) { ++ netpoll_poll_unlock(dev); ++ local_irq_disable(); ++ list_del(&dev->poll_list); ++ list_add_tail(&dev->poll_list, &queue->poll_list); ++ if (dev->quota < 0) ++ dev->quota += dev->weight; ++ else ++ dev->quota = dev->weight; ++ } else { ++ netpoll_poll_unlock(dev); ++ dev_put(dev); ++ local_irq_disable(); ++ } ++ } ++out: ++ local_irq_enable(); ++ return; ++ ++softnet_break: ++ __get_cpu_var(netdev_rx_stat).time_squeeze++; ++ __raise_softirq_irqoff(NET_RX_SOFTIRQ); ++ goto out; ++} ++ ++static gifconf_func_t * gifconf_list [NPROTO]; ++ ++/** ++ * register_gifconf - register a SIOCGIF handler ++ * @family: Address family ++ * @gifconf: Function handler ++ * ++ * Register protocol dependent address dumping routines. The handler ++ * that is passed must not be freed or reused until it has been replaced ++ * by another handler. ++ */ ++int register_gifconf(unsigned int family, gifconf_func_t * gifconf) ++{ ++ if (family >= NPROTO) ++ return -EINVAL; ++ gifconf_list[family] = gifconf; ++ return 0; ++} ++ ++ ++/* ++ * Map an interface index to its name (SIOCGIFNAME) ++ */ ++ ++/* ++ * We need this ioctl for efficient implementation of the ++ * if_indextoname() function required by the IPv6 API. Without ++ * it, we would have to search all the interfaces to find a ++ * match. --pb ++ */ ++ ++static int dev_ifname(struct ifreq __user *arg) ++{ ++ struct net_device *dev; ++ struct ifreq ifr; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_index(ifr.ifr_ifindex); ++ if (!dev) { ++ read_unlock(&dev_base_lock); ++ return -ENODEV; ++ } ++ ++ strcpy(ifr.ifr_name, dev->name); ++ read_unlock(&dev_base_lock); ++ ++ if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) ++ return -EFAULT; ++ return 0; ++} ++ ++/* ++ * Perform a SIOCGIFCONF call. This structure will change ++ * size eventually, and there is nothing I can do about it. ++ * Thus we will need a 'compatibility mode'. ++ */ ++ ++static int dev_ifconf(char __user *arg) ++{ ++ struct ifconf ifc; ++ struct net_device *dev; ++ char __user *pos; ++ int len; ++ int total; ++ int i; ++ ++ /* ++ * Fetch the caller's info block. ++ */ ++ ++ if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) ++ return -EFAULT; ++ ++ pos = ifc.ifc_buf; ++ len = ifc.ifc_len; ++ ++ /* ++ * Loop over the interfaces, and write an info block for each. ++ */ ++ ++ total = 0; ++ for (dev = dev_base; dev; dev = dev->next) { ++ for (i = 0; i < NPROTO; i++) { ++ if (gifconf_list[i]) { ++ int done; ++ if (!pos) ++ done = gifconf_list[i](dev, NULL, 0); ++ else ++ done = gifconf_list[i](dev, pos + total, ++ len - total); ++ if (done < 0) ++ return -EFAULT; ++ total += done; ++ } ++ } ++ } ++ ++ /* ++ * All done. Write the updated control block back to the caller. ++ */ ++ ifc.ifc_len = total; ++ ++ /* ++ * Both BSD and Solaris return 0 here, so we do too. ++ */ ++ return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++/* ++ * This is invoked by the /proc filesystem handler to display a device ++ * in detail. ++ */ ++static __inline__ struct net_device *dev_get_idx(loff_t pos) ++{ ++ struct net_device *dev; ++ loff_t i; ++ ++ for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); ++ ++ return i == pos ? dev : NULL; ++} ++ ++void *dev_seq_start(struct seq_file *seq, loff_t *pos) ++{ ++ read_lock(&dev_base_lock); ++ return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; ++} ++ ++void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; ++} ++ ++void dev_seq_stop(struct seq_file *seq, void *v) ++{ ++ read_unlock(&dev_base_lock); ++} ++ ++static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) ++{ ++ if (dev->get_stats) { ++ struct net_device_stats *stats = dev->get_stats(dev); ++ ++ seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " ++ "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", ++ dev->name, stats->rx_bytes, stats->rx_packets, ++ stats->rx_errors, ++ stats->rx_dropped + stats->rx_missed_errors, ++ stats->rx_fifo_errors, ++ stats->rx_length_errors + stats->rx_over_errors + ++ stats->rx_crc_errors + stats->rx_frame_errors, ++ stats->rx_compressed, stats->multicast, ++ stats->tx_bytes, stats->tx_packets, ++ stats->tx_errors, stats->tx_dropped, ++ stats->tx_fifo_errors, stats->collisions, ++ stats->tx_carrier_errors + ++ stats->tx_aborted_errors + ++ stats->tx_window_errors + ++ stats->tx_heartbeat_errors, ++ stats->tx_compressed); ++ } else ++ seq_printf(seq, "%6s: No statistics available.\n", dev->name); ++} ++ ++/* ++ * Called from the PROCfs module. This now uses the new arbitrary sized ++ * /proc/net interface to create /proc/net/dev ++ */ ++static int dev_seq_show(struct seq_file *seq, void *v) ++{ ++ if (v == SEQ_START_TOKEN) ++ seq_puts(seq, "Inter-| Receive " ++ " | Transmit\n" ++ " face |bytes packets errs drop fifo frame " ++ "compressed multicast|bytes packets errs " ++ "drop fifo colls carrier compressed\n"); ++ else ++ dev_seq_printf_stats(seq, v); ++ return 0; ++} ++ ++static struct netif_rx_stats *softnet_get_online(loff_t *pos) ++{ ++ struct netif_rx_stats *rc = NULL; ++ ++ while (*pos < NR_CPUS) ++ if (cpu_online(*pos)) { ++ rc = &per_cpu(netdev_rx_stat, *pos); ++ break; ++ } else ++ ++*pos; ++ return rc; ++} ++ ++static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) ++{ ++ return softnet_get_online(pos); ++} ++ ++static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return softnet_get_online(pos); ++} ++ ++static void softnet_seq_stop(struct seq_file *seq, void *v) ++{ ++} ++ ++static int softnet_seq_show(struct seq_file *seq, void *v) ++{ ++ struct netif_rx_stats *s = v; ++ ++ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", ++ s->total, s->dropped, s->time_squeeze, s->throttled, ++ s->fastroute_hit, s->fastroute_success, s->fastroute_defer, ++ s->fastroute_deferred_out, ++#if 0 ++ s->fastroute_latency_reduction ++#else ++ s->cpu_collision ++#endif ++ ); ++ return 0; ++} ++ ++static struct seq_operations dev_seq_ops = { ++ .start = dev_seq_start, ++ .next = dev_seq_next, ++ .stop = dev_seq_stop, ++ .show = dev_seq_show, ++}; ++ ++static int dev_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &dev_seq_ops); ++} ++ ++static struct file_operations dev_seq_fops = { ++ .owner = THIS_MODULE, ++ .open = dev_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static struct seq_operations softnet_seq_ops = { ++ .start = softnet_seq_start, ++ .next = softnet_seq_next, ++ .stop = softnet_seq_stop, ++ .show = softnet_seq_show, ++}; ++ ++static int softnet_seq_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &softnet_seq_ops); ++} ++ ++static struct file_operations softnet_seq_fops = { ++ .owner = THIS_MODULE, ++ .open = softnet_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++#ifdef WIRELESS_EXT ++extern int wireless_proc_init(void); ++#else ++#define wireless_proc_init() 0 ++#endif ++ ++static int __init dev_proc_init(void) ++{ ++ int rc = -ENOMEM; ++ ++ if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) ++ goto out; ++ if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) ++ goto out_dev; ++ if (wireless_proc_init()) ++ goto out_softnet; ++ rc = 0; ++out: ++ return rc; ++out_softnet: ++ proc_net_remove("softnet_stat"); ++out_dev: ++ proc_net_remove("dev"); ++ goto out; ++} ++#else ++#define dev_proc_init() 0 ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/** ++ * netdev_set_master - set up master/slave pair ++ * @slave: slave device ++ * @master: new master device ++ * ++ * Changes the master device of the slave. Pass %NULL to break the ++ * bonding. The caller must hold the RTNL semaphore. On a failure ++ * a negative errno code is returned. On success the reference counts ++ * are adjusted, %RTM_NEWLINK is sent to the routing socket and the ++ * function returns zero. ++ */ ++int netdev_set_master(struct net_device *slave, struct net_device *master) ++{ ++ struct net_device *old = slave->master; ++ ++ ASSERT_RTNL(); ++ ++ if (master) { ++ if (old) ++ return -EBUSY; ++ dev_hold(master); ++ } ++ ++ slave->master = master; ++ ++ synchronize_net(); ++ ++ if (old) ++ dev_put(old); ++ ++ if (master) ++ slave->flags |= IFF_SLAVE; ++ else ++ slave->flags &= ~IFF_SLAVE; ++ ++ rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); ++ return 0; ++} ++ ++/** ++ * dev_set_promiscuity - update promiscuity count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove promsicuity from a device. While the count in the device ++ * remains above zero the interface remains promiscuous. Once it hits zero ++ * the device reverts back to normal filtering operation. A negative inc ++ * value is used to drop promiscuity on the device. ++ */ ++void dev_set_promiscuity(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_PROMISC; ++ if ((dev->promiscuity += inc) == 0) ++ dev->flags &= ~IFF_PROMISC; ++ if (dev->flags ^ old_flags) { ++ dev_mc_upload(dev); ++ printk(KERN_INFO "device %s %s promiscuous mode\n", ++ dev->name, (dev->flags & IFF_PROMISC) ? "entered" : ++ "left"); ++ } ++} ++ ++/** ++ * dev_set_allmulti - update allmulti count on a device ++ * @dev: device ++ * @inc: modifier ++ * ++ * Add or remove reception of all multicast frames to a device. While the ++ * count in the device remains above zero the interface remains listening ++ * to all interfaces. Once it hits zero the device reverts back to normal ++ * filtering operation. A negative @inc value is used to drop the counter ++ * when releasing a resource needing all multicasts. ++ */ ++ ++void dev_set_allmulti(struct net_device *dev, int inc) ++{ ++ unsigned short old_flags = dev->flags; ++ ++ dev->flags |= IFF_ALLMULTI; ++ if ((dev->allmulti += inc) == 0) ++ dev->flags &= ~IFF_ALLMULTI; ++ if (dev->flags ^ old_flags) ++ dev_mc_upload(dev); ++} ++ ++unsigned dev_get_flags(const struct net_device *dev) ++{ ++ unsigned flags; ++ ++ flags = (dev->flags & ~(IFF_PROMISC | ++ IFF_ALLMULTI | ++ IFF_RUNNING)) | ++ (dev->gflags & (IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ if (netif_running(dev) && netif_carrier_ok(dev)) ++ flags |= IFF_RUNNING; ++ ++ return flags; ++} ++ ++int dev_change_flags(struct net_device *dev, unsigned flags) ++{ ++ int ret; ++ int old_flags = dev->flags; ++ ++ /* ++ * Set the flags on our device. ++ */ ++ ++ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | ++ IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | ++ IFF_AUTOMEDIA)) | ++ (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | ++ IFF_ALLMULTI)); ++ ++ /* ++ * Load in the correct multicast list now the flags have changed. ++ */ ++ ++ dev_mc_upload(dev); ++ ++ /* ++ * Have we downed the interface. We handle IFF_UP ourselves ++ * according to user attempts to set it, rather than blindly ++ * setting it. ++ */ ++ ++ ret = 0; ++ if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ ++ ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); ++ ++ if (!ret) ++ dev_mc_upload(dev); ++ } ++ ++ if (dev->flags & IFF_UP && ++ ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | ++ IFF_VOLATILE))) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); ++ ++ if ((flags ^ dev->gflags) & IFF_PROMISC) { ++ int inc = (flags & IFF_PROMISC) ? +1 : -1; ++ dev->gflags ^= IFF_PROMISC; ++ dev_set_promiscuity(dev, inc); ++ } ++ ++ /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI ++ is important. Some (broken) drivers set IFF_PROMISC, when ++ IFF_ALLMULTI is requested not asking us and not reporting. ++ */ ++ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { ++ int inc = (flags & IFF_ALLMULTI) ? +1 : -1; ++ dev->gflags ^= IFF_ALLMULTI; ++ dev_set_allmulti(dev, inc); ++ } ++ ++ if (old_flags ^ dev->flags) ++ rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); ++ ++ return ret; ++} ++ ++int dev_set_mtu(struct net_device *dev, int new_mtu) ++{ ++ int err; ++ ++ if (new_mtu == dev->mtu) ++ return 0; ++ ++ /* MTU must be positive. */ ++ if (new_mtu < 0) ++ return -EINVAL; ++ ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ ++ err = 0; ++ if (dev->change_mtu) ++ err = dev->change_mtu(dev, new_mtu); ++ else ++ dev->mtu = new_mtu; ++ if (!err && dev->flags & IFF_UP) ++ notifier_call_chain(&netdev_chain, ++ NETDEV_CHANGEMTU, dev); ++ return err; ++} ++ ++int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) ++{ ++ int err; ++ ++ if (!dev->set_mac_address) ++ return -EOPNOTSUPP; ++ if (sa->sa_family != dev->type) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ err = dev->set_mac_address(dev, sa); ++ if (!err) ++ notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); ++ return err; ++} ++ ++/* ++ * Perform the SIOCxIFxxx calls. ++ */ ++static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) ++{ ++ int err; ++ struct net_device *dev = __dev_get_by_name(ifr->ifr_name); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ switch (cmd) { ++ case SIOCGIFFLAGS: /* Get interface flags */ ++ ifr->ifr_flags = dev_get_flags(dev); ++ return 0; ++ ++ case SIOCSIFFLAGS: /* Set interface flags */ ++ return dev_change_flags(dev, ifr->ifr_flags); ++ ++ case SIOCGIFMETRIC: /* Get the metric on the interface ++ (currently unused) */ ++ ifr->ifr_metric = 0; ++ return 0; ++ ++ case SIOCSIFMETRIC: /* Set the metric on the interface ++ (currently unused) */ ++ return -EOPNOTSUPP; ++ ++ case SIOCGIFMTU: /* Get the MTU of a device */ ++ ifr->ifr_mtu = dev->mtu; ++ return 0; ++ ++ case SIOCSIFMTU: /* Set the MTU of a device */ ++ return dev_set_mtu(dev, ifr->ifr_mtu); ++ ++ case SIOCGIFHWADDR: ++ if (!dev->addr_len) ++ memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); ++ else ++ memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, ++ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); ++ ifr->ifr_hwaddr.sa_family = dev->type; ++ return 0; ++ ++ case SIOCSIFHWADDR: ++ return dev_set_mac_address(dev, &ifr->ifr_hwaddr); ++ ++ case SIOCSIFHWBROADCAST: ++ if (ifr->ifr_hwaddr.sa_family != dev->type) ++ return -EINVAL; ++ memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, ++ min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); ++ notifier_call_chain(&netdev_chain, ++ NETDEV_CHANGEADDR, dev); ++ return 0; ++ ++ case SIOCGIFMAP: ++ ifr->ifr_map.mem_start = dev->mem_start; ++ ifr->ifr_map.mem_end = dev->mem_end; ++ ifr->ifr_map.base_addr = dev->base_addr; ++ ifr->ifr_map.irq = dev->irq; ++ ifr->ifr_map.dma = dev->dma; ++ ifr->ifr_map.port = dev->if_port; ++ return 0; ++ ++ case SIOCSIFMAP: ++ if (dev->set_config) { ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev->set_config(dev, &ifr->ifr_map); ++ } ++ return -EOPNOTSUPP; ++ ++ case SIOCADDMULTI: ++ if (!dev->set_multicast_list || ++ ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, ++ dev->addr_len, 1); ++ ++ case SIOCDELMULTI: ++ if (!dev->set_multicast_list || ++ ifr->ifr_hwaddr.sa_family != AF_UNSPEC) ++ return -EINVAL; ++ if (!netif_device_present(dev)) ++ return -ENODEV; ++ return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, ++ dev->addr_len, 1); ++ ++ case SIOCGIFINDEX: ++ ifr->ifr_ifindex = dev->ifindex; ++ return 0; ++ ++ case SIOCGIFTXQLEN: ++ ifr->ifr_qlen = dev->tx_queue_len; ++ return 0; ++ ++ case SIOCSIFTXQLEN: ++ if (ifr->ifr_qlen < 0) ++ return -EINVAL; ++ dev->tx_queue_len = ifr->ifr_qlen; ++ return 0; ++ ++ case SIOCSIFNAME: ++ ifr->ifr_newname[IFNAMSIZ-1] = '\0'; ++ return dev_change_name(dev, ifr->ifr_newname); ++ ++ /* ++ * Unknown or private ioctl ++ */ ++ ++ default: ++ if ((cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15) || ++ cmd == SIOCBONDENSLAVE || ++ cmd == SIOCBONDRELEASE || ++ cmd == SIOCBONDSETHWADDR || ++ cmd == SIOCBONDSLAVEINFOQUERY || ++ cmd == SIOCBONDINFOQUERY || ++ cmd == SIOCBONDCHANGEACTIVE || ++ cmd == SIOCGMIIPHY || ++ cmd == SIOCGMIIREG || ++ cmd == SIOCSMIIREG || ++ cmd == SIOCBRADDIF || ++ cmd == SIOCBRDELIF || ++ cmd == SIOCWANDEV) { ++ err = -EOPNOTSUPP; ++ if (dev->do_ioctl) { ++ if (netif_device_present(dev)) ++ err = dev->do_ioctl(dev, ifr, ++ cmd); ++ else ++ err = -ENODEV; ++ } ++ } else ++ err = -EINVAL; ++ ++ } ++ return err; ++} ++ ++/* ++ * This function handles all "interface"-type I/O control requests. The actual ++ * 'doing' part of this is dev_ifsioc above. ++ */ ++ ++/** ++ * dev_ioctl - network device ioctl ++ * @cmd: command to issue ++ * @arg: pointer to a struct ifreq in user space ++ * ++ * Issue ioctl functions to devices. This is normally called by the ++ * user space syscall interfaces but can sometimes be useful for ++ * other purposes. The return value is the return from the syscall if ++ * positive or a negative errno code on error. ++ */ ++ ++int dev_ioctl(unsigned int cmd, void __user *arg) ++{ ++ struct ifreq ifr; ++ int ret; ++ char *colon; ++ ++ /* One special case: SIOCGIFCONF takes ifconf argument ++ and requires shared lock, because it sleeps writing ++ to user space. ++ */ ++ ++ if (cmd == SIOCGIFCONF) { ++ rtnl_shlock(); ++ ret = dev_ifconf((char __user *) arg); ++ rtnl_shunlock(); ++ return ret; ++ } ++ if (cmd == SIOCGIFNAME) ++ return dev_ifname((struct ifreq __user *)arg); ++ ++ if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) ++ return -EFAULT; ++ ++ ifr.ifr_name[IFNAMSIZ-1] = 0; ++ ++ colon = strchr(ifr.ifr_name, ':'); ++ if (colon) ++ *colon = 0; ++ ++ /* ++ * See which interface the caller is talking about. ++ */ ++ ++ switch (cmd) { ++ /* ++ * These ioctl calls: ++ * - can be done by all. ++ * - atomic and do not require locking. ++ * - return a value ++ */ ++ case SIOCGIFFLAGS: ++ case SIOCGIFMETRIC: ++ case SIOCGIFMTU: ++ case SIOCGIFHWADDR: ++ case SIOCGIFSLAVE: ++ case SIOCGIFMAP: ++ case SIOCGIFINDEX: ++ case SIOCGIFTXQLEN: ++ dev_load(ifr.ifr_name); ++ read_lock(&dev_base_lock); ++ ret = dev_ifsioc(&ifr, cmd); ++ read_unlock(&dev_base_lock); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ case SIOCETHTOOL: ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ethtool(&ifr); ++ rtnl_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. ++ * - return a value ++ */ ++ case SIOCGMIIPHY: ++ case SIOCGMIIREG: ++ case SIOCSIFNAME: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ if (!ret) { ++ if (colon) ++ *colon = ':'; ++ if (copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ } ++ return ret; ++ ++ /* ++ * These ioctl calls: ++ * - require superuser power. ++ * - require strict serialization. ++ * - do not return a value ++ */ ++ case SIOCSIFFLAGS: ++ case SIOCSIFMETRIC: ++ case SIOCSIFMTU: ++ case SIOCSIFMAP: ++ case SIOCSIFHWADDR: ++ case SIOCSIFSLAVE: ++ case SIOCADDMULTI: ++ case SIOCDELMULTI: ++ case SIOCSIFHWBROADCAST: ++ case SIOCSIFTXQLEN: ++ case SIOCSMIIREG: ++ case SIOCBONDENSLAVE: ++ case SIOCBONDRELEASE: ++ case SIOCBONDSETHWADDR: ++ case SIOCBONDSLAVEINFOQUERY: ++ case SIOCBONDINFOQUERY: ++ case SIOCBONDCHANGEACTIVE: ++ case SIOCBRADDIF: ++ case SIOCBRDELIF: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ return ret; ++ ++ case SIOCGIFMEM: ++ /* Get the per device memory space. We can add this but ++ * currently do not support it */ ++ case SIOCSIFMEM: ++ /* Set the per device memory buffer space. ++ * Not applicable in our case */ ++ case SIOCSIFLINK: ++ return -EINVAL; ++ ++ /* ++ * Unknown or private ioctl. ++ */ ++ default: ++ if (cmd == SIOCWANDEV || ++ (cmd >= SIOCDEVPRIVATE && ++ cmd <= SIOCDEVPRIVATE + 15)) { ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(&ifr, cmd); ++ rtnl_unlock(); ++ if (!ret && copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ return ret; ++ } ++#ifdef WIRELESS_EXT ++ /* Take care of Wireless Extensions */ ++ if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { ++ /* If command is `set a parameter', or ++ * `get the encoding parameters', check if ++ * the user has the right to do it */ ++ if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ } ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ /* Follow me in net/core/wireless.c */ ++ ret = wireless_process_ioctl(&ifr, cmd); ++ rtnl_unlock(); ++ if (IW_IS_GET(cmd) && ++ copy_to_user(arg, &ifr, ++ sizeof(struct ifreq))) ++ ret = -EFAULT; ++ return ret; ++ } ++#endif /* WIRELESS_EXT */ ++ return -EINVAL; ++ } ++} ++ ++ ++/** ++ * dev_new_index - allocate an ifindex ++ * ++ * Returns a suitable unique value for a new device interface ++ * number. The caller must hold the rtnl semaphore or the ++ * dev_base_lock to be sure it remains unique. ++ */ ++static int dev_new_index(void) ++{ ++ static int ifindex; ++ for (;;) { ++ if (++ifindex <= 0) ++ ifindex = 1; ++ if (!__dev_get_by_index(ifindex)) ++ return ifindex; ++ } ++} ++ ++static int dev_boot_phase = 1; ++ ++/* Delayed registration/unregisteration */ ++static DEFINE_SPINLOCK(net_todo_list_lock); ++static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); ++ ++static inline void net_set_todo(struct net_device *dev) ++{ ++ spin_lock(&net_todo_list_lock); ++ list_add_tail(&dev->todo_list, &net_todo_list); ++ spin_unlock(&net_todo_list_lock); ++} ++ ++/** ++ * register_netdevice - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * register_netdev() instead of this. ++ * ++ * BUGS: ++ * The locking appears insufficient to guarantee two parallel registers ++ * will not get the same name. ++ */ ++ ++int register_netdevice(struct net_device *dev) ++{ ++ struct hlist_head *head; ++ struct hlist_node *p; ++ int ret; ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ /* When net_device's are persistent, this will be fatal. */ ++ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); ++ ++ spin_lock_init(&dev->queue_lock); ++ spin_lock_init(&dev->xmit_lock); ++ dev->xmit_lock_owner = -1; ++#ifdef CONFIG_NET_CLS_ACT ++ spin_lock_init(&dev->ingress_lock); ++#endif ++ ++ ret = alloc_divert_blk(dev); ++ if (ret) ++ goto out; ++ ++ dev->iflink = -1; ++ ++ /* Init, if this function is available */ ++ if (dev->init) { ++ ret = dev->init(dev); ++ if (ret) { ++ if (ret > 0) ++ ret = -EIO; ++ goto out_err; ++ } ++ } ++ ++ if (!dev_valid_name(dev->name)) { ++ ret = -EINVAL; ++ goto out_err; ++ } ++ ++ dev->ifindex = dev_new_index(); ++ if (dev->iflink == -1) ++ dev->iflink = dev->ifindex; ++ ++ /* Check for existence of name */ ++ head = dev_name_hash(dev->name); ++ hlist_for_each(p, head) { ++ struct net_device *d ++ = hlist_entry(p, struct net_device, name_hlist); ++ if (!strncmp(d->name, dev->name, IFNAMSIZ)) { ++ ret = -EEXIST; ++ goto out_err; ++ } ++ } ++ ++ /* Fix illegal SG+CSUM combinations. */ ++ if ((dev->features & NETIF_F_SG) && ++ !(dev->features & (NETIF_F_IP_CSUM | ++ NETIF_F_NO_CSUM | ++ NETIF_F_HW_CSUM))) { ++ printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", ++ dev->name); ++ dev->features &= ~NETIF_F_SG; ++ } ++ ++ /* TSO requires that SG is present as well. */ ++ if ((dev->features & NETIF_F_TSO) && ++ !(dev->features & NETIF_F_SG)) { ++ printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", ++ dev->name); ++ dev->features &= ~NETIF_F_TSO; ++ } ++ ++ /* ++ * nil rebuild_header routine, ++ * that should be never called and used as just bug trap. ++ */ ++ ++ if (!dev->rebuild_header) ++ dev->rebuild_header = default_rebuild_header; ++ ++ /* ++ * Default initial state at registry is that the ++ * device is present. ++ */ ++ ++ set_bit(__LINK_STATE_PRESENT, &dev->state); ++ ++ dev->next = NULL; ++ dev_init_scheduler(dev); ++ write_lock_bh(&dev_base_lock); ++ *dev_tail = dev; ++ dev_tail = &dev->next; ++ hlist_add_head(&dev->name_hlist, head); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); ++ dev_hold(dev); ++ dev->reg_state = NETREG_REGISTERING; ++ write_unlock_bh(&dev_base_lock); ++ ++ /* Notify protocols, that a new device appeared. */ ++ notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); ++ ++ /* Finish registration after unlock */ ++ net_set_todo(dev); ++ ret = 0; ++ ++out: ++ return ret; ++out_err: ++ free_divert_blk(dev); ++ goto out; ++} ++ ++/** ++ * register_netdev - register a network device ++ * @dev: device to register ++ * ++ * Take a completed network device structure and add it to the kernel ++ * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier ++ * chain. 0 is returned on success. A negative errno code is returned ++ * on a failure to set up the device, or if the name is a duplicate. ++ * ++ * This is a wrapper around register_netdev that takes the rtnl semaphore ++ * and expands the device name if you passed a format string to ++ * alloc_netdev. ++ */ ++int register_netdev(struct net_device *dev) ++{ ++ int err; ++ ++ rtnl_lock(); ++ ++ /* ++ * If the name is a format string the caller wants us to do a ++ * name allocation. ++ */ ++ if (strchr(dev->name, '%')) { ++ err = dev_alloc_name(dev, dev->name); ++ if (err < 0) ++ goto out; ++ } ++ ++ /* ++ * Back compatibility hook. Kill this one in 2.5 ++ */ ++ if (dev->name[0] == 0 || dev->name[0] == ' ') { ++ err = dev_alloc_name(dev, "eth%d"); ++ if (err < 0) ++ goto out; ++ } ++ ++ err = register_netdevice(dev); ++out: ++ rtnl_unlock(); ++ return err; ++} ++EXPORT_SYMBOL(register_netdev); ++ ++/* ++ * netdev_wait_allrefs - wait until all references are gone. ++ * ++ * This is called when unregistering network devices. ++ * ++ * Any protocol or device that holds a reference should register ++ * for netdevice notification, and cleanup and put back the ++ * reference if they receive an UNREGISTER event. ++ * We can get stuck here if buggy protocols don't correctly ++ * call dev_put. ++ */ ++static void netdev_wait_allrefs(struct net_device *dev) ++{ ++ unsigned long rebroadcast_time, warning_time; ++ ++ rebroadcast_time = warning_time = jiffies; ++ while (atomic_read(&dev->refcnt) != 0) { ++ if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { ++ rtnl_shlock(); ++ ++ /* Rebroadcast unregister notification */ ++ notifier_call_chain(&netdev_chain, ++ NETDEV_UNREGISTER, dev); ++ ++ if (test_bit(__LINK_STATE_LINKWATCH_PENDING, ++ &dev->state)) { ++ /* We must not have linkwatch events ++ * pending on unregister. If this ++ * happens, we simply run the queue ++ * unscheduled, resulting in a noop ++ * for this device. ++ */ ++ linkwatch_run_queue(); ++ } ++ ++ rtnl_shunlock(); ++ ++ rebroadcast_time = jiffies; ++ } ++ ++ msleep(250); ++ ++ if (time_after(jiffies, warning_time + 10 * HZ)) { ++ printk(KERN_EMERG "unregister_netdevice: " ++ "waiting for %s to become free. Usage " ++ "count = %d\n", ++ dev->name, atomic_read(&dev->refcnt)); ++ warning_time = jiffies; ++ } ++ } ++} ++ ++/* The sequence is: ++ * ++ * rtnl_lock(); ++ * ... ++ * register_netdevice(x1); ++ * register_netdevice(x2); ++ * ... ++ * unregister_netdevice(y1); ++ * unregister_netdevice(y2); ++ * ... ++ * rtnl_unlock(); ++ * free_netdev(y1); ++ * free_netdev(y2); ++ * ++ * We are invoked by rtnl_unlock() after it drops the semaphore. ++ * This allows us to deal with problems: ++ * 1) We can create/delete sysfs objects which invoke hotplug ++ * without deadlocking with linkwatch via keventd. ++ * 2) Since we run with the RTNL semaphore not held, we can sleep ++ * safely in order to wait for the netdev refcnt to drop to zero. ++ */ ++static DECLARE_MUTEX(net_todo_run_mutex); ++void netdev_run_todo(void) ++{ ++ struct list_head list = LIST_HEAD_INIT(list); ++ int err; ++ ++ ++ /* Need to guard against multiple cpu's getting out of order. */ ++ down(&net_todo_run_mutex); ++ ++ /* Not safe to do outside the semaphore. We must not return ++ * until all unregister events invoked by the local processor ++ * have been completed (either by this todo run, or one on ++ * another cpu). ++ */ ++ if (list_empty(&net_todo_list)) ++ goto out; ++ ++ /* Snapshot list, allow later requests */ ++ spin_lock(&net_todo_list_lock); ++ list_splice_init(&net_todo_list, &list); ++ spin_unlock(&net_todo_list_lock); ++ ++ while (!list_empty(&list)) { ++ struct net_device *dev ++ = list_entry(list.next, struct net_device, todo_list); ++ list_del(&dev->todo_list); ++ ++ switch(dev->reg_state) { ++ case NETREG_REGISTERING: ++ err = netdev_register_sysfs(dev); ++ if (err) ++ printk(KERN_ERR "%s: failed sysfs registration (%d)\n", ++ dev->name, err); ++ dev->reg_state = NETREG_REGISTERED; ++ break; ++ ++ case NETREG_UNREGISTERING: ++ netdev_unregister_sysfs(dev); ++ dev->reg_state = NETREG_UNREGISTERED; ++ ++ netdev_wait_allrefs(dev); ++ ++ /* paranoia */ ++ BUG_ON(atomic_read(&dev->refcnt)); ++ BUG_TRAP(!dev->ip_ptr); ++ BUG_TRAP(!dev->ip6_ptr); ++ BUG_TRAP(!dev->dn_ptr); ++ ++ ++ /* It must be the very last action, ++ * after this 'dev' may point to freed up memory. ++ */ ++ if (dev->destructor) ++ dev->destructor(dev); ++ break; ++ ++ default: ++ printk(KERN_ERR "network todo '%s' but state %d\n", ++ dev->name, dev->reg_state); ++ break; ++ } ++ } ++ ++out: ++ up(&net_todo_run_mutex); ++} ++ ++/** ++ * alloc_netdev - allocate network device ++ * @sizeof_priv: size of private data to allocate space for ++ * @name: device name format string ++ * @setup: callback to initialize device ++ * ++ * Allocates a struct net_device with private data area for driver use ++ * and performs basic initialization. ++ */ ++struct net_device *alloc_netdev(int sizeof_priv, const char *name, ++ void (*setup)(struct net_device *)) ++{ ++ void *p; ++ struct net_device *dev; ++ int alloc_size; ++ ++ /* ensure 32-byte alignment of both the device and private area */ ++ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; ++ alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; ++ ++ p = kmalloc(alloc_size, GFP_KERNEL); ++ if (!p) { ++ printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); ++ return NULL; ++ } ++ memset(p, 0, alloc_size); ++ ++ dev = (struct net_device *) ++ (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); ++ dev->padded = (char *)dev - (char *)p; ++ ++ if (sizeof_priv) ++ dev->priv = netdev_priv(dev); ++ ++ setup(dev); ++ strcpy(dev->name, name); ++ return dev; ++} ++EXPORT_SYMBOL(alloc_netdev); ++ ++/** ++ * free_netdev - free network device ++ * @dev: device ++ * ++ * This function does the last stage of destroying an allocated device ++ * interface. The reference to the device object is released. ++ * If this is the last reference then it will be freed. ++ */ ++void free_netdev(struct net_device *dev) ++{ ++#ifdef CONFIG_SYSFS ++ /* Compatiablity with error handling in drivers */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ kfree((char *)dev - dev->padded); ++ return; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_UNREGISTERED); ++ dev->reg_state = NETREG_RELEASED; ++ ++ /* will free via class release */ ++ class_device_put(&dev->class_dev); ++#else ++ kfree((char *)dev - dev->padded); ++#endif ++} ++ ++/* Synchronize with packet receive processing. */ ++void synchronize_net(void) ++{ ++ might_sleep(); ++ synchronize_rcu(); ++} ++ ++/** ++ * unregister_netdevice - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. On success 0 is returned, on a failure ++ * a negative errno code is returned. ++ * ++ * Callers must hold the rtnl semaphore. You may want ++ * unregister_netdev() instead of this. ++ */ ++ ++int unregister_netdevice(struct net_device *dev) ++{ ++ struct net_device *d, **dp; ++ ++ BUG_ON(dev_boot_phase); ++ ASSERT_RTNL(); ++ ++ /* Some devices call without registering for initialization unwind. */ ++ if (dev->reg_state == NETREG_UNINITIALIZED) { ++ printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " ++ "was registered\n", dev->name, dev); ++ return -ENODEV; ++ } ++ ++ BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ ++ /* If device is running, close it first. */ ++ if (dev->flags & IFF_UP) ++ dev_close(dev); ++ ++ /* And unlink it from device chain. */ ++ for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { ++ if (d == dev) { ++ write_lock_bh(&dev_base_lock); ++ hlist_del(&dev->name_hlist); ++ hlist_del(&dev->index_hlist); ++ if (dev_tail == &dev->next) ++ dev_tail = dp; ++ *dp = d->next; ++ write_unlock_bh(&dev_base_lock); ++ break; ++ } ++ } ++ if (!d) { ++ printk(KERN_ERR "unregister net_device: '%s' not found\n", ++ dev->name); ++ return -ENODEV; ++ } ++ ++ dev->reg_state = NETREG_UNREGISTERING; ++ ++ synchronize_net(); ++ ++ /* Shutdown queueing discipline. */ ++ dev_shutdown(dev); ++ ++ ++ /* Notify protocols, that we are about to destroy ++ this device. They should clean all the things. ++ */ ++ notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); ++ ++ /* ++ * Flush the multicast chain ++ */ ++ dev_mc_discard(dev); ++ ++ if (dev->uninit) ++ dev->uninit(dev); ++ ++ /* Notifier chain MUST detach us from master device. */ ++ BUG_TRAP(!dev->master); ++ ++ free_divert_blk(dev); ++ ++ /* Finish processing unregister after unlock */ ++ net_set_todo(dev); ++ ++ synchronize_net(); ++ ++ dev_put(dev); ++ return 0; ++} ++ ++/** ++ * unregister_netdev - remove device from the kernel ++ * @dev: device ++ * ++ * This function shuts down a device interface and removes it ++ * from the kernel tables. On success 0 is returned, on a failure ++ * a negative errno code is returned. ++ * ++ * This is just a wrapper for unregister_netdevice that takes ++ * the rtnl semaphore. In general you want to use this and not ++ * unregister_netdevice. ++ */ ++void unregister_netdev(struct net_device *dev) ++{ ++ rtnl_lock(); ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++} ++ ++EXPORT_SYMBOL(unregister_netdev); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static int dev_cpu_callback(struct notifier_block *nfb, ++ unsigned long action, ++ void *ocpu) ++{ ++ struct sk_buff **list_skb; ++ struct net_device **list_net; ++ struct sk_buff *skb; ++ unsigned int cpu, oldcpu = (unsigned long)ocpu; ++ struct softnet_data *sd, *oldsd; ++ ++ if (action != CPU_DEAD) ++ return NOTIFY_OK; ++ ++ local_irq_disable(); ++ cpu = smp_processor_id(); ++ sd = &per_cpu(softnet_data, cpu); ++ oldsd = &per_cpu(softnet_data, oldcpu); ++ ++ /* Find end of our completion_queue. */ ++ list_skb = &sd->completion_queue; ++ while (*list_skb) ++ list_skb = &(*list_skb)->next; ++ /* Append completion queue from offline CPU. */ ++ *list_skb = oldsd->completion_queue; ++ oldsd->completion_queue = NULL; ++ ++ /* Find end of our output_queue. */ ++ list_net = &sd->output_queue; ++ while (*list_net) ++ list_net = &(*list_net)->next_sched; ++ /* Append output queue from offline CPU. */ ++ *list_net = oldsd->output_queue; ++ oldsd->output_queue = NULL; ++ ++ raise_softirq_irqoff(NET_TX_SOFTIRQ); ++ local_irq_enable(); ++ ++ /* Process offline CPU's input_pkt_queue */ ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) ++ netif_rx(skb); ++ ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++ ++/* ++ * Initialize the DEV module. At boot time this walks the device list and ++ * unhooks any devices that fail to initialise (normally hardware not ++ * present) and leaves us with a valid list of present and active devices. ++ * ++ */ ++ ++/* ++ * This is called single threaded during boot, so no need ++ * to take the rtnl semaphore. ++ */ ++static int __init net_dev_init(void) ++{ ++ int i, rc = -ENOMEM; ++ ++ BUG_ON(!dev_boot_phase); ++ ++ net_random_init(); ++ ++ if (dev_proc_init()) ++ goto out; ++ ++ if (netdev_sysfs_init()) ++ goto out; ++ ++ INIT_LIST_HEAD(&ptype_all); ++ for (i = 0; i < 16; i++) ++ INIT_LIST_HEAD(&ptype_base[i]); ++ ++ for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) ++ INIT_HLIST_HEAD(&dev_name_head[i]); ++ ++ for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) ++ INIT_HLIST_HEAD(&dev_index_head[i]); ++ ++ /* ++ * Initialise the packet receive queues. ++ */ ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ struct softnet_data *queue; ++ ++ queue = &per_cpu(softnet_data, i); ++ skb_queue_head_init(&queue->input_pkt_queue); ++ queue->throttle = 0; ++ queue->cng_level = 0; ++ queue->avg_blog = 10; /* arbitrary non-zero */ ++ queue->completion_queue = NULL; ++ INIT_LIST_HEAD(&queue->poll_list); ++ set_bit(__LINK_STATE_START, &queue->backlog_dev.state); ++ queue->backlog_dev.weight = weight_p; ++ queue->backlog_dev.poll = process_backlog; ++ atomic_set(&queue->backlog_dev.refcnt, 1); ++ } ++ ++#ifdef OFFLINE_SAMPLE ++ samp_timer.expires = jiffies + (10 * HZ); ++ add_timer(&samp_timer); ++#endif ++ ++ dev_boot_phase = 0; ++ ++ open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); ++ open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); ++ ++ hotcpu_notifier(dev_cpu_callback, 0); ++ dst_init(); ++ dev_mcast_init(); ++ rc = 0; ++out: ++ return rc; ++} ++ ++subsys_initcall(net_dev_init); ++ ++EXPORT_SYMBOL(__dev_get_by_index); ++EXPORT_SYMBOL(__dev_get_by_name); ++EXPORT_SYMBOL(__dev_remove_pack); ++EXPORT_SYMBOL(__skb_linearize); ++EXPORT_SYMBOL(dev_add_pack); ++EXPORT_SYMBOL(dev_alloc_name); ++EXPORT_SYMBOL(dev_close); ++EXPORT_SYMBOL(dev_get_by_flags); ++EXPORT_SYMBOL(dev_get_by_index); ++EXPORT_SYMBOL(dev_get_by_name); ++EXPORT_SYMBOL(dev_ioctl); ++EXPORT_SYMBOL(dev_open); ++EXPORT_SYMBOL(dev_queue_xmit); ++EXPORT_SYMBOL(dev_remove_pack); ++EXPORT_SYMBOL(dev_set_allmulti); ++EXPORT_SYMBOL(dev_set_promiscuity); ++EXPORT_SYMBOL(dev_change_flags); ++EXPORT_SYMBOL(dev_set_mtu); ++EXPORT_SYMBOL(dev_set_mac_address); ++EXPORT_SYMBOL(free_netdev); ++EXPORT_SYMBOL(netdev_boot_setup_check); ++EXPORT_SYMBOL(netdev_set_master); ++EXPORT_SYMBOL(netdev_state_change); ++EXPORT_SYMBOL(netif_receive_skb); ++EXPORT_SYMBOL(netif_rx); ++EXPORT_SYMBOL(register_gifconf); ++EXPORT_SYMBOL(register_netdevice); ++EXPORT_SYMBOL(register_netdevice_notifier); ++EXPORT_SYMBOL(skb_checksum_help); ++EXPORT_SYMBOL(synchronize_net); ++EXPORT_SYMBOL(unregister_netdevice); ++EXPORT_SYMBOL(unregister_netdevice_notifier); ++EXPORT_SYMBOL(net_enable_timestamp); ++EXPORT_SYMBOL(net_disable_timestamp); ++EXPORT_SYMBOL(dev_get_flags); ++ ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++EXPORT_SYMBOL(br_handle_frame_hook); ++EXPORT_SYMBOL(br_fdb_get_hook); ++EXPORT_SYMBOL(br_fdb_put_hook); ++#endif ++ ++#ifdef CONFIG_KMOD ++EXPORT_SYMBOL(dev_load); ++#endif ++ ++EXPORT_PER_CPU_SYMBOL(softnet_data); +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/Kconfig linux-2.6.12.5-1-686-smp-ring3/net/ring/Kconfig +--- linux-2.6.12.5/net/ring/Kconfig 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/Kconfig 2005-10-22 23:50:45.539482000 +0200 +@@ -0,0 +1,14 @@ ++config RING ++ tristate "PF_RING sockets (EXPERIMENTAL)" ++ depends on EXPERIMENTAL ++ ---help--- ++ PF_RING socket family, optimized for packet capture. ++ If a PF_RING socket is bound to an adapter (via the bind() system ++ call), such adapter will be used in read-only mode until the socket ++ is destroyed. Whenever an incoming packet is received from the adapter ++ it will not passed to upper layers, but instead it is copied to a ring ++ buffer, which in turn is exported to user space applications via mmap. ++ Please refer to http://luca.ntop.org/Ring.pdf for more. ++ ++ Say N unless you know what you are doing. ++ +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/Makefile linux-2.6.12.5-1-686-smp-ring3/net/ring/Makefile +--- linux-2.6.12.5/net/ring/Makefile 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/Makefile 2005-10-22 23:50:45.051451500 +0200 +@@ -0,0 +1,7 @@ ++# ++# Makefile for the ring driver. ++# ++ ++obj-m += ring.o ++ ++ring-objs := ring_packet.o +diff --unified --recursive --new-file linux-2.6.12.5/net/ring/ring_packet.c linux-2.6.12.5-1-686-smp-ring3/net/ring/ring_packet.c +--- linux-2.6.12.5/net/ring/ring_packet.c 1970-01-01 01:00:00.000000000 +0100 ++++ linux-2.6.12.5-1-686-smp-ring3/net/ring/ring_packet.c 2005-10-22 23:50:45.159458250 +0200 +@@ -0,0 +1,1592 @@ ++/* ++ * ++ * (C) 2004-05 - Luca Deri <deri@ntop.org> ++ * ++ * This code includes patches courtesy of ++ * - Jeff Randall <jrandall@nexvu.com> ++ * - Helmut Manck <helmut.manck@secunet.com> ++ * - Brad Doctor <bdoctor@ps-ax.com> ++ * ++ */ ++ ++/* FIX: add an entry inside the /proc filesystem */ ++ ++#include <linux/version.h> ++#include <linux/config.h> ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/socket.h> ++#include <linux/skbuff.h> ++#include <linux/rtnetlink.h> ++#include <linux/in.h> ++#include <linux/in6.h> ++#include <linux/init.h> ++#include <linux/filter.h> ++#include <linux/ring.h> ++#include <linux/ip.h> ++#include <linux/tcp.h> ++#include <linux/udp.h> ++#include <linux/list.h> ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#include <net/xfrm.h> ++#else ++#include <linux/poll.h> ++#endif ++#include <net/sock.h> ++#include <asm/io.h> /* needed for virt_to_phys() */ ++ ++/* #define RING_DEBUG */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11)) ++static inline int remap_page_range(struct vm_area_struct *vma, ++ unsigned long uvaddr, ++ unsigned long paddr, ++ unsigned long size, ++ pgprot_t prot) { ++ return(remap_pfn_range(vma, uvaddr, paddr >> PAGE_SHIFT, ++ size, prot)); ++} ++#endif ++ ++/* ************************************************* */ ++ ++#define CLUSTER_LEN 8 ++ ++struct ring_cluster { ++ u_short cluster_id; /* 0 = no cluster */ ++ u_short num_cluster_elements; ++ enum cluster_type hashing_mode; ++ u_short hashing_id; ++ struct sock *sk[CLUSTER_LEN]; ++ struct ring_cluster *next; /* NULL = last element of the cluster */ ++}; ++ ++/* ************************************************* */ ++ ++struct ring_element { ++ struct list_head list; ++ struct sock *sk; ++}; ++ ++/* ************************************************* */ ++ ++struct ring_opt { ++ struct net_device *ring_netdev; ++ ++ /* Cluster */ ++ u_short cluster_id; /* 0 = no cluster */ ++ ++ /* Reflector */ ++ struct net_device *reflector_dev; ++ ++ /* Packet buffers */ ++ unsigned long order; ++ ++ /* Ring Slots */ ++ unsigned long ring_memory; ++ FlowSlotInfo *slots_info; /* Basically it points to ring_memory */ ++ char *ring_slots; /* Basically it points to ring_memory ++ +sizeof(FlowSlotInfo) */ ++ ++ /* Packet Sampling */ ++ u_int pktToSample, sample_rate; ++ ++ /* BPF Filter */ ++ struct sk_filter *bpfFilter; ++ ++ /* Locks */ ++ atomic_t num_ring_slots_waiters; ++ wait_queue_head_t ring_slots_waitqueue; ++ rwlock_t ring_index_lock; ++ ++ /* Indexes (Internal) */ ++ u_int insert_page_id, insert_slot_id; ++}; ++ ++/* ************************************************* */ ++ ++/* List of all ring sockets. */ ++static struct list_head ring_table; ++ ++/* List of all clusters */ ++static struct ring_cluster *ring_cluster_list; ++ ++static rwlock_t ring_mgmt_lock = RW_LOCK_UNLOCKED; ++ ++/* ********************************** */ ++ ++/* Forward */ ++static struct proto_ops ring_ops; ++ ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++static struct proto ring_proto; ++#endif ++ ++static int skb_ring_handler(struct sk_buff *skb, u_char recv_packet, ++ u_char real_skb); ++static int buffer_ring_handler(struct net_device *dev, char *data, int len); ++static int remove_from_cluster(struct sock *sock, struct ring_opt *pfr); ++ ++/* Extern */ ++ ++/* ********************************** */ ++ ++/* Defaults */ ++static u_int bucket_len = 128, num_slots = 4096, sample_rate = 1, ++ transparent_mode = 0, enable_tx_capture = 0; ++ ++MODULE_PARM(bucket_len, "i"); ++MODULE_PARM_DESC(bucket_len, "Number of ring buckets"); ++MODULE_PARM(num_slots, "i"); ++MODULE_PARM_DESC(num_slots, "Number of ring slots"); ++MODULE_PARM(sample_rate, "i"); ++MODULE_PARM_DESC(sample_rate, "Ring packet sample rate"); ++MODULE_PARM(transparent_mode, "i"); ++MODULE_PARM_DESC(transparent_mode, ++ "Set to 1 to set transparent mode " ++ "(slower but backwards compatible)"); ++MODULE_PARM(enable_tx_capture, "i"); ++MODULE_PARM_DESC(enable_tx_capture, "Set to 1 to capture outgoing packets"); ++ ++/* ********************************** */ ++ ++#define MIN_QUEUED_PKTS 64 ++#define MAX_QUEUE_LOOPS 64 ++ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#define ring_sk_datatype(__sk) ((struct ring_opt *)__sk) ++#define ring_sk(__sk) ((__sk)->sk_protinfo) ++#else ++#define ring_sk_datatype(a) (a) ++#define ring_sk(__sk) ((__sk)->protinfo.pf_ring) ++#endif ++ ++/* ++ int dev_queue_xmit(struct sk_buff *skb) ++ skb->dev; ++ struct net_device *dev_get_by_name(const char *name) ++*/ ++ ++/* ********************************** */ ++ ++static void ring_sock_destruct(struct sock *sk) { ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ skb_queue_purge(&sk->sk_receive_queue); ++ ++ if (!sock_flag(sk, SOCK_DEAD)) { ++#if defined(RING_DEBUG) ++ printk("Attempt to release alive ring socket: %p\n", sk); ++#endif ++ return; ++ } ++ ++ BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); ++ BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); ++#else ++ ++ BUG_TRAP(atomic_read(&sk->rmem_alloc)==0); ++ BUG_TRAP(atomic_read(&sk->wmem_alloc)==0); ++ ++ if (!sk->dead) { ++#if defined(RING_DEBUG) ++ printk("Attempt to release alive ring socket: %p\n", sk); ++#endif ++ return; ++ } ++#endif ++ ++ kfree(ring_sk(sk)); ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_DEC_USE_COUNT; ++#endif ++} ++ ++/* ********************************** */ ++/* ++ * ring_insert() ++ * ++ * store the sk in a new element and add it ++ * to the head of the list. ++ */ ++static inline void ring_insert(struct sock *sk) { ++ struct ring_element *next; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_insert()\n"); ++#endif ++ ++ next = kmalloc(sizeof(struct ring_element), GFP_ATOMIC); ++ if(next != NULL) { ++ next->sk = sk; ++ write_lock_irq(&ring_mgmt_lock); ++ list_add(&next->list, &ring_table); ++ write_unlock_irq(&ring_mgmt_lock); ++ } else { ++ if (net_ratelimit()) ++ printk("RING: could not kmalloc slot!!\n"); ++ } ++} ++ ++/* ********************************** */ ++/* ++ * ring_remove() ++ * ++ * For each of the elements in the list: ++ * - check if this is the element we want to delete ++ * - if it is, remove it from the list, and free it. ++ * ++ * stop when we find the one we're looking for (break), ++ * or when we reach the end of the list. ++ */ ++static inline void ring_remove(struct sock *sk) { ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ if(entry->sk == sk) { ++ write_lock_irq(&ring_mgmt_lock); ++ list_del(ptr); ++ kfree(ptr); ++ write_unlock_irq(&ring_mgmt_lock); ++ break; ++ } ++ } ++ ++} ++ ++/* ********************************** */ ++ ++static u_int32_t num_queued_pkts(struct ring_opt *pfr) { ++ ++ if(pfr->ring_slots != NULL) { ++ ++ u_int32_t tot_insert = pfr->slots_info->insert_idx, ++#if defined(RING_DEBUG) ++ tot_read = pfr->slots_info->tot_read, tot_pkts; ++#else ++ tot_read = pfr->slots_info->tot_read; ++#endif ++ ++ if(tot_insert >= tot_read) { ++#if defined(RING_DEBUG) ++ tot_pkts = tot_insert-tot_read; ++#endif ++ return(tot_insert-tot_read); ++ } else { ++#if defined(RING_DEBUG) ++ tot_pkts = ((u_int32_t)-1)+tot_insert-tot_read; ++#endif ++ return(((u_int32_t)-1)+tot_insert-tot_read); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("-> num_queued_pkts=%d [tot_insert=%d][tot_read=%d]\n", ++ tot_pkts, tot_insert, tot_read); ++#endif ++ ++ } else ++ return(0); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_insert_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_insert_slot(%d)\n", pfr->slots_info->insert_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) { ++ FlowSlot *slot = (FlowSlot*)&(pfr->ring_slots[pfr->slots_info->insert_idx ++ *pfr->slots_info->slot_len]); ++ return(slot); ++ } else ++ return(NULL); ++} ++ ++/* ********************************** */ ++ ++static inline FlowSlot* get_remove_slot(struct ring_opt *pfr) { ++#if defined(RING_DEBUG) ++ printk("get_remove_slot(%d)\n", pfr->slots_info->remove_idx); ++#endif ++ ++ if(pfr->ring_slots != NULL) ++ return((FlowSlot*)&(pfr->ring_slots[pfr->slots_info->remove_idx* ++ pfr->slots_info->slot_len])); ++ else ++ return(NULL); ++} ++ ++/* ********************************** */ ++ ++static void add_skb_to_ring(struct sk_buff *skb, ++ struct ring_opt *pfr, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ FlowSlot *theSlot; ++ int idx, displ; ++ ++ if(recv_packet) { ++ /* Hack for identifying a packet received by the e1000 */ ++ if(real_skb) { ++ displ = SKB_DISPLACEMENT; ++ } else ++ displ = 0; /* Received by the e1000 wrapper */ ++ } else ++ displ = 0; ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_pkts++; ++ write_unlock(&pfr->ring_index_lock); ++ ++ /* BPF Filtering (from af_packet.c) */ ++ if(pfr->bpfFilter != NULL) { ++ unsigned res = 1, len; ++ ++ len = skb->len-skb->data_len; ++ ++ write_lock(&pfr->ring_index_lock); ++ skb->data -= displ; ++ res = sk_run_filter(skb, pfr->bpfFilter->insns, pfr->bpfFilter->len); ++ skb->data += displ; ++ write_unlock(&pfr->ring_index_lock); ++ ++ if(res == 0) { ++ /* Filter failed */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]" ++ "[insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ ++ return; ++ } ++ } ++ ++ /* ************************** */ ++ ++ if(pfr->sample_rate > 1) { ++ if(pfr->pktToSample == 0) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample = pfr->sample_rate; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->pktToSample--; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): sampled packet [len=%d]" ++ "[tot=%llu][insertIdx=%d][pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ return; ++ } ++ } ++ ++ /* ************************************* */ ++ ++ if((pfr->reflector_dev != NULL) ++ && (!netif_queue_stopped(pfr->reflector_dev))) { ++ int cpu = smp_processor_id(); ++ ++ /* increase reference counter so that this skb is not freed */ ++ atomic_inc(&skb->users); ++ ++ skb->data -= displ; ++ ++ /* send it */ ++ if (pfr->reflector_dev->xmit_lock_owner != cpu) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = cpu; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ ++ if (pfr->reflector_dev->hard_start_xmit(skb, ++ pfr->reflector_dev) == 0) { ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ skb->data += displ; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit succeeded\n"); ++#endif ++ return; /* OK */ ++ } ++ ++ spin_lock_bh(&pfr->reflector_dev->xmit_lock); ++ pfr->reflector_dev->xmit_lock_owner = -1; ++ spin_unlock_bh(&pfr->reflector_dev->xmit_lock); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("++ hard_start_xmit failed\n"); ++#endif ++ skb->data += displ; ++ return; /* -ENETDOWN */ ++ } ++ ++ /* ************************************* */ ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb) [len=%d][tot=%llu][insertIdx=%d]" ++ "[pkt_type=%d][cloned=%d]\n", ++ (int)skb->len, pfr->slots_info->tot_pkts, ++ pfr->slots_info->insert_idx, ++ skb->pkt_type, skb->cloned); ++#endif ++ ++ idx = pfr->slots_info->insert_idx; ++ theSlot = get_insert_slot(pfr); ++ ++ if((theSlot != NULL) && (theSlot->slot_state == 0)) { ++ struct pcap_pkthdr *hdr; ++ unsigned int bucketSpace; ++ char *bucket; ++ ++ /* Update Index */ ++ idx++; ++ ++ if(idx == pfr->slots_info->tot_slots) { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = 0; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->insert_idx = idx; ++ write_unlock(&pfr->ring_index_lock); ++ } ++ ++ bucketSpace = pfr->slots_info->slot_len ++#ifdef RING_MAGIC ++ - sizeof(u_char) ++#endif ++ - sizeof(u_char) /* flowSlot.slot_state */ ++ - sizeof(struct pcap_pkthdr) ++ - 1 /* 10 */ /* safe boundary */; ++ ++ bucket = &theSlot->bucket; ++ hdr = (struct pcap_pkthdr*)bucket; ++ ++ if(skb->stamp.tv_sec == 0) do_gettimeofday(&skb->stamp); ++ ++ hdr->ts.tv_sec = skb->stamp.tv_sec, hdr->ts.tv_usec = skb->stamp.tv_usec; ++ hdr->caplen = skb->len+displ; ++ ++ if(hdr->caplen > bucketSpace) ++ hdr->caplen = bucketSpace; ++ ++ hdr->len = skb->len+displ; ++ memcpy(&bucket[sizeof(struct pcap_pkthdr)], ++ skb->data-displ, hdr->caplen); ++ ++#if defined(RING_DEBUG) ++ { ++ static unsigned int lastLoss = 0; ++ ++ if(pfr->slots_info->tot_lost ++ && (lastLoss != pfr->slots_info->tot_lost)) { ++ printk("add_skb_to_ring(%d): [bucketSpace=%d]" ++ "[hdr.caplen=%d][skb->len=%d]" ++ "[pcap_pkthdr=%d][removeIdx=%d]" ++ "[loss=%lu][page=%u][slot=%u]\n", ++ idx-1, bucketSpace, hdr->caplen, skb->len, ++ sizeof(struct pcap_pkthdr), ++ pfr->slots_info->remove_idx, ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->insert_page_id, pfr->insert_slot_id); ++ ++ lastLoss = pfr->slots_info->tot_lost; ++ } ++ } ++#endif ++ ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_insert++; ++ theSlot->slot_state = 1; ++ write_unlock(&pfr->ring_index_lock); ++ } else { ++ write_lock(&pfr->ring_index_lock); ++ pfr->slots_info->tot_lost++; ++ write_unlock(&pfr->ring_index_lock); ++ ++#if defined(RING_DEBUG) ++ printk("add_skb_to_ring(skb): packet lost [loss=%lu]" ++ "[removeIdx=%u][insertIdx=%u]\n", ++ (long unsigned int)pfr->slots_info->tot_lost, ++ pfr->slots_info->remove_idx, pfr->slots_info->insert_idx); ++#endif ++ } ++ ++ /* wakeup in case of poll() */ ++ if(waitqueue_active(&pfr->ring_slots_waitqueue)) ++ wake_up_interruptible(&pfr->ring_slots_waitqueue); ++} ++ ++/* ********************************** */ ++ ++static u_int hash_skb(struct ring_cluster *cluster_ptr, ++ struct sk_buff *skb, u_char recv_packet) { ++ u_int idx; ++ int displ; ++ struct iphdr *ip; ++ ++ if(cluster_ptr->hashing_mode == cluster_round_robin) { ++ idx = cluster_ptr->hashing_id++; ++ } else { ++ /* Per-flow clustering */ ++ if(skb->len > sizeof(struct iphdr)+sizeof(struct tcphdr)) { ++ if(recv_packet) ++ displ = 0; ++ else ++ displ = SKB_DISPLACEMENT; ++ ++ /* ++ skb->data+displ ++ ++ Always points to to the IP part of the packet ++ */ ++ ++ ip = (struct iphdr*)(skb->data+displ); ++ ++ idx = ip->saddr+ip->daddr+ip->protocol; ++ ++ if(ip->protocol == IPPROTO_TCP) { ++ struct tcphdr *tcp = (struct tcphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += tcp->source+tcp->dest; ++ } else if(ip->protocol == IPPROTO_UDP) { ++ struct udphdr *udp = (struct udphdr*)(skb->data+displ ++ +sizeof(struct iphdr)); ++ idx += udp->source+udp->dest; ++ } ++ } else ++ idx = skb->len; ++ } ++ ++ return(idx % cluster_ptr->num_cluster_elements); ++} ++ ++/* ********************************** */ ++ ++static int skb_ring_handler(struct sk_buff *skb, ++ u_char recv_packet, ++ u_char real_skb /* 1=skb 0=faked skb */) { ++ struct sock *skElement; ++ int rc = 0; ++ struct list_head *ptr; ++ struct ring_cluster *cluster_ptr; ++ ++ if((!skb) /* Invalid skb */ ++ || ((!enable_tx_capture) && (!recv_packet))) { ++ /* ++ An outgoing packet is about to be sent out ++ but we decided not to handle transmitted ++ packets. ++ */ ++ return(0); ++ } ++ ++#if defined(RING_DEBUG) ++ if(0) { ++ printk("skb_ring_handler() [len=%d][dev=%s]\n", skb->len, ++ skb->dev->name == NULL ? "<NULL>" : skb->dev->name); ++ } ++#endif ++ ++ /* [1] Check unclustered sockets */ ++ for (ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ struct ring_opt *pfr; ++ struct ring_element *entry; ++ ++ entry = list_entry(ptr, struct ring_element, list); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = entry->sk; ++ pfr = ring_sk(skElement); ++ read_unlock(&ring_mgmt_lock); ++ ++ if((pfr != NULL) ++ && (pfr->cluster_id == 0 /* No cluster */) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ ++ /* [2] Check socket clusters */ ++ cluster_ptr = ring_cluster_list; ++ ++ while(cluster_ptr != NULL) { ++ struct ring_opt *pfr; ++ ++ if(cluster_ptr->num_cluster_elements > 0) { ++ u_int skb_hash = hash_skb(cluster_ptr, skb, recv_packet); ++ ++ read_lock(&ring_mgmt_lock); ++ skElement = cluster_ptr->sk[skb_hash]; ++ read_unlock(&ring_mgmt_lock); ++ ++ if(skElement != NULL) { ++ pfr = ring_sk(skElement); ++ ++ if((pfr != NULL) ++ && (pfr->ring_slots != NULL) ++ && (pfr->ring_netdev == skb->dev)) { ++ /* We've found the ring where the packet can be stored */ ++ read_lock(&ring_mgmt_lock); ++ add_skb_to_ring(skb, pfr, recv_packet, real_skb); ++ read_unlock(&ring_mgmt_lock); ++ ++ rc = 1; /* Ring found: we've done our job */ ++ } ++ } ++ } ++ ++ cluster_ptr = cluster_ptr->next; ++ } ++ ++ if(transparent_mode) rc = 0; ++ ++ if((rc != 0) && real_skb) ++ dev_kfree_skb(skb); /* Free the skb */ ++ ++ return(rc); /* 0 = packet not handled */ ++} ++ ++/* ********************************** */ ++ ++struct sk_buff skb; ++ ++static int buffer_ring_handler(struct net_device *dev, ++ char *data, int len) { ++ ++#if defined(RING_DEBUG) ++ printk("buffer_ring_handler: [dev=%s][len=%d]\n", ++ dev->name == NULL ? "<NULL>" : dev->name, len); ++#endif ++ ++ skb.dev = dev, skb.len = len, skb.data = data, ++ skb.data_len = len, skb.stamp.tv_sec = 0; /* Calculate the time */ ++ ++ skb_ring_handler(&skb, 1, 0 /* fake skb */); ++ ++ return(0); ++} ++ ++/* ********************************** */ ++ ++static int ring_create(struct socket *sock, int protocol) { ++ struct sock *sk; ++ struct ring_opt *pfr; ++ int err; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create()\n"); ++#endif ++ ++ /* Are you root, superuser or so ? */ ++ if(!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if(sock->type != SOCK_RAW) ++ return -ESOCKTNOSUPPORT; ++ ++ if(protocol != htons(ETH_P_ALL)) ++ return -EPROTONOSUPPORT; ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_INC_USE_COUNT; ++#endif ++ ++ err = -ENOMEM; ++ ++ // BD: -- broke this out to keep it more simple and clear as to what the ++ // options are. ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_KERNEL, 1, NULL); ++#endif ++#endif ++ ++ // BD: API changed in 2.6.12, ref: ++ // http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++ sk = sk_alloc(PF_RING, GFP_ATOMIC, &ring_proto, 1); ++#endif ++ ++ if (sk == NULL) ++ goto out; ++ ++ sock->ops = &ring_ops; ++ sock_init_data(sock, sk); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,11)) ++ sk_set_owner(sk, THIS_MODULE); ++#endif ++#endif ++ ++ err = -ENOMEM; ++ ring_sk(sk) = ring_sk_datatype(kmalloc(sizeof(*pfr), GFP_KERNEL)); ++ ++ if (!(pfr = ring_sk(sk))) { ++ sk_free(sk); ++ goto out; ++ } ++ memset(pfr, 0, sizeof(*pfr)); ++ init_waitqueue_head(&pfr->ring_slots_waitqueue); ++ pfr->ring_index_lock = RW_LOCK_UNLOCKED; ++ atomic_set(&pfr->num_ring_slots_waiters, 0); ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ sk->sk_family = PF_RING; ++ sk->sk_destruct = ring_sock_destruct; ++#else ++ sk->family = PF_RING; ++ sk->destruct = ring_sock_destruct; ++ sk->num = protocol; ++#endif ++ ++ ring_insert(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_create() - created\n"); ++#endif ++ ++ return(0); ++ out: ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++ MOD_DEC_USE_COUNT; ++#endif ++ return err; ++} ++ ++/* *********************************************** */ ++ ++static int ring_release(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ ++ if(!sk) ++ return 0; ++ ++#if defined(RING_DEBUG) ++ printk("RING: called ring_release\n"); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release entered\n"); ++#endif ++ ++ ring_remove(sk); ++ ++ sock_orphan(sk); ++ sock->sk = NULL; ++ ++ /* Free the ring buffer */ ++ if(pfr->ring_memory) { ++ struct page *page, *page_end; ++ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ ClearPageReserved(page); ++ ++ free_pages(pfr->ring_memory, pfr->order); ++ } ++ ++ kfree(pfr); ++ ring_sk(sk) = NULL; ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ skb_queue_purge(&sk->sk_write_queue); ++#endif ++ sock_put(sk); ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_release leaving\n"); ++#endif ++ ++ return 0; ++} ++ ++/* ********************************** */ ++/* ++ * We create a ring for this socket and bind it to the specified device ++ */ ++static int packet_ring_bind(struct sock *sk, struct net_device *dev) ++{ ++ u_int the_slot_len; ++ u_int32_t tot_mem; ++ struct ring_opt *pfr = ring_sk(sk); ++ struct page *page, *page_end; ++ ++ if(!dev) return(-1); ++ ++#if defined(RING_DEBUG) ++ printk("RING: packet_ring_bind(%s) called\n", dev->name); ++#endif ++ ++ /* ********************************************** ++ ++ ************************************* ++ * * ++ * FlowSlotInfo * ++ * * ++ ************************************* <-+ ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* +- num_slots ++ * FlowSlot * | ++ ************************************* | ++ * FlowSlot * | ++ ************************************* <-+ ++ ++ ********************************************** */ ++ ++ the_slot_len = sizeof(u_char) /* flowSlot.slot_state */ ++ + sizeof(u_short) /* flowSlot.slot_len */ ++ + bucket_len /* flowSlot.bucket */; ++ ++ tot_mem = sizeof(FlowSlotInfo) + num_slots*the_slot_len; ++ ++ /* ++ Calculate the value of the order parameter used later. ++ See http://www.linuxjournal.com/article.php?sid=1133 ++ */ ++ for(pfr->order = 0;(PAGE_SIZE << pfr->order) < tot_mem; pfr->order++) ; ++ ++ /* ++ We now try to allocate the memory as required. If we fail ++ we try to allocate a smaller amount or memory (hence a ++ smaller ring). ++ */ ++ while((pfr->ring_memory = __get_free_pages(GFP_ATOMIC, pfr->order)) == 0) ++ if(pfr->order-- == 0) ++ break; ++ ++ if(pfr->order == 0) { ++#if defined(RING_DEBUG) ++ printk("ERROR: not enough memory\n"); ++#endif ++ return(-1); ++ } else { ++#if defined(RING_DEBUG) ++ printk("RING: succesfully allocated %lu KB [tot_mem=%d][order=%ld]\n", ++ PAGE_SIZE >> (10 - pfr->order), tot_mem, pfr->order); ++#endif ++ } ++ ++ tot_mem = PAGE_SIZE << pfr->order; ++ memset((char*)pfr->ring_memory, 0, tot_mem); ++ ++ /* Now we need to reserve the pages */ ++ page_end = virt_to_page(pfr->ring_memory + (PAGE_SIZE << pfr->order) - 1); ++ for(page = virt_to_page(pfr->ring_memory); page <= page_end; page++) ++ SetPageReserved(page); ++ ++ pfr->slots_info = (FlowSlotInfo*)pfr->ring_memory; ++ pfr->ring_slots = (char*)(pfr->ring_memory+sizeof(FlowSlotInfo)); ++ ++ pfr->slots_info->version = RING_FLOWSLOT_VERSION; ++ pfr->slots_info->slot_len = the_slot_len; ++ pfr->slots_info->tot_slots = (tot_mem-sizeof(FlowSlotInfo))/the_slot_len; ++ pfr->slots_info->tot_mem = tot_mem; ++ pfr->slots_info->sample_rate = sample_rate; ++ ++#if defined(RING_DEBUG) ++ printk("RING: allocated %d slots [slot_len=%d][tot_mem=%u]\n", ++ pfr->slots_info->tot_slots, pfr->slots_info->slot_len, ++ pfr->slots_info->tot_mem); ++#endif ++ ++#ifdef RING_MAGIC ++ { ++ int i; ++ ++ for(i=0; i<pfr->slots_info->tot_slots; i++) { ++ unsigned long idx = i*pfr->slots_info->slot_len; ++ FlowSlot *slot = (FlowSlot*)&pfr->ring_slots[idx]; ++ slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; ++ } ++ } ++#endif ++ ++ pfr->insert_page_id = 1, pfr->insert_slot_id = 0; ++ ++ /* ++ IMPORTANT ++ Leave this statement here as last one. In fact when ++ the ring_netdev != NULL the socket is ready to be used. ++ */ ++ pfr->ring_netdev = dev; ++ ++ return(0); ++} ++ ++/* ************************************* */ ++ ++/* Bind to a device */ ++static int ring_bind(struct socket *sock, ++ struct sockaddr *sa, int addr_len) ++{ ++ struct sock *sk=sock->sk; ++ struct net_device *dev = NULL; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_bind() called\n"); ++#endif ++ ++ /* ++ * Check legality ++ */ ++ if (addr_len != sizeof(struct sockaddr)) ++ return -EINVAL; ++ if (sa->sa_family != PF_RING) ++ return -EINVAL; ++ ++ /* Safety check: add trailing zero if missing */ ++ sa->sa_data[sizeof(sa->sa_data)-1] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("RING: searching device %s\n", sa->sa_data); ++#endif ++ ++ if((dev = __dev_get_by_name(sa->sa_data)) == NULL) { ++#if defined(RING_DEBUG) ++ printk("RING: search failed\n"); ++#endif ++ return(-EINVAL); ++ } else ++ return(packet_ring_bind(sk, dev)); ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ ++volatile void* virt_to_kseg(volatile void* address) { ++ pte_t *pte; ++ pud_t *pud; ++ unsigned long addr = (unsigned long)address; ++ ++ pud = pud_offset(pgd_offset_k((unsigned long) address), ++ (unsigned long) address); ++ ++ /* ++ High-memory support courtesy of ++ Brad Doctor <bdoctor@ps-ax.com> ++ */ ++#if defined(CONFIG_X86_PAE) && (!defined(CONFIG_NOHIGHMEM)) ++ pte = pte_offset_map(pmd_offset(pud, addr), addr); ++#else ++ pte = pmd_offset_map(pud, addr); ++#endif ++ ++ return((volatile void*)pte_page(*pte)); ++} ++ ++#else /* 2.4 */ ++ ++/* http://www.scs.ch/~frey/linux/memorymap.html */ ++volatile void *virt_to_kseg(volatile void *address) ++{ ++ pgd_t *pgd; pmd_t *pmd; pte_t *ptep, pte; ++ unsigned long va, ret = 0UL; ++ ++ va=VMALLOC_VMADDR((unsigned long)address); ++ ++ /* get the page directory. Use the kernel memory map. */ ++ pgd = pgd_offset_k(va); ++ ++ /* check whether we found an entry */ ++ if (!pgd_none(*pgd)) ++ { ++ /* get the page middle directory */ ++ pmd = pmd_offset(pgd, va); ++ /* check whether we found an entry */ ++ if (!pmd_none(*pmd)) ++ { ++ /* get a pointer to the page table entry */ ++ ptep = pte_offset(pmd, va); ++ pte = *ptep; ++ /* check for a valid page */ ++ if (pte_present(pte)) ++ { ++ /* get the address the page is refering to */ ++ ret = (unsigned long)page_address(pte_page(pte)); ++ /* add the offset within the page to the page address */ ++ ret |= (va & (PAGE_SIZE -1)); ++ } ++ } ++ } ++ return((volatile void *)ret); ++} ++#endif ++ ++/* ************************************* */ ++ ++static int ring_mmap(struct file *file, ++ struct socket *sock, ++ struct vm_area_struct *vma) ++{ ++ struct sock *sk = sock->sk; ++ struct ring_opt *pfr = ring_sk(sk); ++ unsigned long size, start; ++ u_int pagesToMap; ++ char *ptr; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called\n"); ++#endif ++ ++ if(pfr->ring_memory == 0) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: mapping area to an unbound socket\n"); ++#endif ++ return -EINVAL; ++ } ++ ++ size = (unsigned long)(vma->vm_end-vma->vm_start); ++ ++ if(size % PAGE_SIZE) { ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() failed: len is not multiple of PAGE_SIZE\n"); ++#endif ++ return(-EINVAL); ++ } ++ ++ /* if userspace tries to mmap beyond end of our buffer, fail */ ++ if(size > pfr->slots_info->tot_mem) { ++#if defined(RING_DEBUG) ++ printk("proc_mmap() failed: area too large [%ld > %d]\n", size, pfr->slots_info->tot_mem); ++#endif ++ return(-EINVAL); ++ } ++ ++ pagesToMap = size/PAGE_SIZE; ++ ++#if defined(RING_DEBUG) ++ printk("RING: ring_mmap() called. %d pages to map\n", pagesToMap); ++#endif ++ ++#if defined(RING_DEBUG) ++ printk("RING: mmap [slot_len=%d][tot_slots=%d] for ring on device %s\n", ++ pfr->slots_info->slot_len, pfr->slots_info->tot_slots, ++ pfr->ring_netdev->name); ++#endif ++ ++ /* we do not want to have this area swapped out, lock it */ ++ vma->vm_flags |= VM_LOCKED; ++ start = vma->vm_start; ++ ++ /* Ring slots start from page 1 (page 0 is reserved for FlowSlotInfo) */ ++ ptr = (char*)(start+PAGE_SIZE); ++ ++ if(remap_page_range( ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ vma, ++#endif ++ start, ++ __pa(pfr->ring_memory), ++ PAGE_SIZE*pagesToMap, vma->vm_page_prot)) { ++#if defined(RING_DEBUG) ++ printk("remap_page_range() failed\n"); ++#endif ++ return(-EAGAIN); ++ } ++ ++#if defined(RING_DEBUG) ++ printk("proc_mmap(pagesToMap=%d): success.\n", pagesToMap); ++#endif ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++static int ring_recvmsg(struct kiocb *iocb, struct socket *sock, ++ struct msghdr *msg, size_t len, int flags) ++#else ++ static int ring_recvmsg(struct socket *sock, struct msghdr *msg, int len, ++ int flags, struct scm_cookie *scm) ++#endif ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ u_int32_t queued_pkts, num_loops = 0; ++ ++#if defined(RING_DEBUG) ++ printk("ring_recvmsg called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ while((queued_pkts = num_queued_pkts(pfr)) < MIN_QUEUED_PKTS) { ++ wait_event_interruptible(pfr->ring_slots_waitqueue, 1); ++ ++#if defined(RING_DEBUG) ++ printk("-> ring_recvmsg returning %d [queued_pkts=%d][num_loops=%d]\n", ++ slot->slot_state, queued_pkts, num_loops); ++#endif ++ ++ if(queued_pkts > 0) { ++ if(num_loops++ > MAX_QUEUE_LOOPS) ++ break; ++ } ++ } ++ ++#if defined(RING_DEBUG) ++ if(slot != NULL) ++ printk("ring_recvmsg is returning [queued_pkts=%d][num_loops=%d]\n", ++ queued_pkts, num_loops); ++#endif ++ ++ return(queued_pkts); ++} ++ ++/* ************************************* */ ++ ++unsigned int ring_poll(struct file * file, ++ struct socket *sock, poll_table *wait) ++{ ++ FlowSlot* slot; ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ ++#if defined(RING_DEBUG) ++ printk("poll called\n"); ++#endif ++ ++ slot = get_remove_slot(pfr); ++ ++ if((slot != NULL) && (slot->slot_state == 0)) ++ poll_wait(file, &pfr->ring_slots_waitqueue, wait); ++ ++#if defined(RING_DEBUG) ++ printk("poll returning %d\n", slot->slot_state); ++#endif ++ ++ if((slot != NULL) && (slot->slot_state == 1)) ++ return(POLLIN | POLLRDNORM); ++ else ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int add_to_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ ++ if(el->num_cluster_elements == CLUSTER_LEN) ++ return(-1); /* Cluster full */ ++ ++ ring_sk_datatype(ring_sk(sock))->cluster_id = el->cluster_id; ++ el->sk[el->num_cluster_elements] = sock; ++ el->num_cluster_elements++; ++ return(0); ++} ++ ++/* ************************************* */ ++ ++int remove_from_cluster_list(struct ring_cluster *el, ++ struct sock *sock) { ++ int i, j; ++ ++ for(i=0; i<CLUSTER_LEN; i++) ++ if(el->sk[i] == sock) { ++ el->num_cluster_elements--; ++ ++ if(el->num_cluster_elements > 0) { ++ /* The cluster contains other elements */ ++ for(j=i; j<CLUSTER_LEN-1; j++) ++ el->sk[j] = el->sk[j+1]; ++ ++ el->sk[CLUSTER_LEN-1] = NULL; ++ } else { ++ /* Empty cluster */ ++ memset(el->sk, 0, sizeof(el->sk)); ++ } ++ ++ return(0); ++ } ++ ++ return(-1); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int remove_from_cluster(struct sock *sock, ++ struct ring_opt *pfr) ++{ ++ struct ring_cluster *el; ++ ++#if defined(RING_DEBUG) ++ printk("--> remove_from_cluster(%d)\n", pfr->cluster_id); ++#endif ++ ++ if(pfr->cluster_id == 0 /* 0 = No Cluster */) ++ return(0); /* Noting to do */ ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == pfr->cluster_id) { ++ return(remove_from_cluster_list(el, sock)); ++ } else ++ el = el->next; ++ } ++ ++ return(-EINVAL); /* Not found */ ++} ++ ++/* ************************************* */ ++ ++static int add_to_cluster(struct sock *sock, ++ struct ring_opt *pfr, ++ u_short cluster_id) ++{ ++ struct ring_cluster *el; ++ ++#ifndef RING_DEBUG ++ printk("--> add_to_cluster(%d)\n", cluster_id); ++#endif ++ ++ if(cluster_id == 0 /* 0 = No Cluster */) return(-EINVAL); ++ ++ if(pfr->cluster_id != 0) ++ remove_from_cluster(sock, pfr); ++ ++ el = ring_cluster_list; ++ ++ while(el != NULL) { ++ if(el->cluster_id == cluster_id) { ++ return(add_to_cluster_list(el, sock)); ++ } else ++ el = el->next; ++ } ++ ++ /* There's no existing cluster. We need to create one */ ++ if((el = kmalloc(sizeof(struct ring_cluster), GFP_KERNEL)) == NULL) ++ return(-ENOMEM); ++ ++ el->cluster_id = cluster_id; ++ el->num_cluster_elements = 1; ++ el->hashing_mode = cluster_per_flow; /* Default */ ++ el->hashing_id = 0; ++ ++ memset(el->sk, 0, sizeof(el->sk)); ++ el->sk[0] = sock; ++ el->next = ring_cluster_list; ++ ring_cluster_list = el; ++ pfr->cluster_id = cluster_id; ++ ++ return(0); /* 0 = OK */ ++} ++ ++/* ************************************* */ ++ ++/* Code taken/inspired from core/sock.c */ ++static int ring_setsockopt(struct socket *sock, ++ int level, int optname, ++ char *optval, int optlen) ++{ ++ struct ring_opt *pfr = ring_sk(sock->sk); ++ int val, found, ret = 0; ++ u_int cluster_id; ++ char devName[8]; ++ ++ if((optlen<sizeof(int)) || (pfr == NULL)) ++ return(-EINVAL); ++ ++ if (get_user(val, (int *)optval)) ++ return -EFAULT; ++ ++ found = 1; ++ ++ switch(optname) ++ { ++ case SO_ATTACH_FILTER: ++ ret = -EINVAL; ++ if (optlen == sizeof(struct sock_fprog)) { ++ unsigned int fsize; ++ struct sock_fprog fprog; ++ struct sk_filter *filter; ++ ++ ret = -EFAULT; ++ ++ /* ++ NOTE ++ ++ Do not call copy_from_user within a held ++ splinlock (e.g. ring_mgmt_lock) as this caused ++ problems when certain debugging was enabled under ++ 2.6.5 -- including hard lockups of the machine. ++ */ ++ if(copy_from_user(&fprog, optval, sizeof(fprog))) ++ break; ++ ++ fsize = sizeof(struct sock_filter) * fprog.len; ++ filter = kmalloc(fsize, GFP_KERNEL); ++ ++ if(filter == NULL) { ++ ret = -ENOMEM; ++ break; ++ } ++ ++ if(copy_from_user(filter->insns, fprog.filter, fsize)) ++ break; ++ ++ filter->len = fprog.len; ++ ++ if(sk_chk_filter(filter->insns, filter->len) != 0) { ++ /* Bad filter specified */ ++ kfree(filter); ++ pfr->bpfFilter = NULL; ++ break; ++ } ++ ++ /* get the lock, set the filter, release the lock */ ++ write_lock(&ring_mgmt_lock); ++ pfr->bpfFilter = filter; ++ write_unlock(&ring_mgmt_lock); ++ } ++ ret = 0; ++ break; ++ ++ case SO_DETACH_FILTER: ++ write_lock(&ring_mgmt_lock); ++ found = 1; ++ if(pfr->bpfFilter != NULL) { ++ kfree(pfr->bpfFilter); ++ pfr->bpfFilter = NULL; ++ write_unlock(&ring_mgmt_lock); ++ break; ++ } ++ ret = -ENONET; ++ break; ++ ++ case SO_ADD_TO_CLUSTER: ++ if (optlen!=sizeof(val)) ++ return -EINVAL; ++ ++ if (copy_from_user(&cluster_id, optval, sizeof(cluster_id))) ++ return -EFAULT; ++ ++ write_lock(&ring_mgmt_lock); ++ ret = add_to_cluster(sock->sk, pfr, cluster_id); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_REMOVE_FROM_CLUSTER: ++ write_lock(&ring_mgmt_lock); ++ ret = remove_from_cluster(sock->sk, pfr); ++ write_unlock(&ring_mgmt_lock); ++ break; ++ ++ case SO_SET_REFLECTOR: ++ if(optlen >= (sizeof(devName)-1)) ++ return -EINVAL; ++ ++ if(optlen > 0) { ++ if(copy_from_user(devName, optval, optlen)) ++ return -EFAULT; ++ } ++ ++ devName[optlen] = '\0'; ++ ++#if defined(RING_DEBUG) ++ printk("+++ SO_SET_REFLECTOR(%s)\n", devName); ++#endif ++ ++ write_lock(&ring_mgmt_lock); ++ pfr->reflector_dev = dev_get_by_name(devName); ++ write_unlock(&ring_mgmt_lock); ++ ++#if defined(RING_DEBUG) ++ if(pfr->reflector_dev != NULL) ++ printk("SO_SET_REFLECTOR(%s): succeded\n", devName); ++ else ++ printk("SO_SET_REFLECTOR(%s): device unknown\n", devName); ++#endif ++ break; ++ ++ default: ++ found = 0; ++ break; ++ } ++ ++ if(found) ++ return(ret); ++ else ++ return(sock_setsockopt(sock, level, optname, optval, optlen)); ++} ++ ++/* ************************************* */ ++ ++static int ring_ioctl(struct socket *sock, ++ unsigned int cmd, unsigned long arg) ++{ ++ switch(cmd) ++ { ++ case SIOCGIFFLAGS: ++ case SIOCSIFFLAGS: ++ case SIOCGIFCONF: ++ case SIOCGIFMETRIC: ++ case SIOCSIFMETRIC: ++ case SIOCGIFMEM: ++ case SIOCSIFMEM: ++ case SIOCGIFMTU: ++ case SIOCSIFMTU: ++ case SIOCSIFLINK: ++ case SIOCGIFHWADDR: ++ case SIOCSIFHWADDR: ++ case SIOCSIFMAP: ++ case SIOCGIFMAP: ++ case SIOCSIFSLAVE: ++ case SIOCGIFSLAVE: ++ case SIOCGIFINDEX: ++ case SIOCGIFNAME: ++ case SIOCGIFCOUNT: ++ case SIOCSIFHWBROADCAST: ++ return(dev_ioctl(cmd,(void *) arg)); ++ ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return 0; ++} ++ ++/* ************************************* */ ++ ++static struct proto_ops ring_ops = { ++ .family = PF_RING, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++ ++ /* Operations that make no sense on ring sockets. */ ++ .connect = sock_no_connect, ++ .socketpair = sock_no_socketpair, ++ .accept = sock_no_accept, ++ .getname = sock_no_getname, ++ .listen = sock_no_listen, ++ .shutdown = sock_no_shutdown, ++ .sendpage = sock_no_sendpage, ++ .sendmsg = sock_no_sendmsg, ++ .getsockopt = sock_no_getsockopt, ++ ++ /* Now the operations that really occur. */ ++ .release = ring_release, ++ .bind = ring_bind, ++ .mmap = ring_mmap, ++ .poll = ring_poll, ++ .setsockopt = ring_setsockopt, ++ .ioctl = ring_ioctl, ++ .recvmsg = ring_recvmsg, ++}; ++ ++/* ************************************ */ ++ ++static struct net_proto_family ring_family_ops = { ++ .family = PF_RING, ++ .create = ring_create, ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++ .owner = THIS_MODULE, ++#endif ++}; ++ ++// BD: API changed in 2.6.12, ref: ++// http://svn.clkao.org/svnweb/linux/revision/?rev=28201 ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11)) ++static struct proto ring_proto = { ++ .name = "PF_RING", ++ .owner = THIS_MODULE, ++ .obj_size = sizeof(struct sock), ++}; ++#endif ++ ++/* ************************************ */ ++ ++static void __exit ring_exit(void) ++{ ++ struct list_head *ptr; ++ struct ring_element *entry; ++ ++ for(ptr = ring_table.next; ptr != &ring_table; ptr = ptr->next) { ++ entry = list_entry(ptr, struct ring_element, list); ++ kfree(entry); ++ } ++ ++ while(ring_cluster_list != NULL) { ++ struct ring_cluster *next = ring_cluster_list->next; ++ kfree(ring_cluster_list); ++ ring_cluster_list = next; ++ } ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ ++ printk("PF_RING shut down.\n"); ++} ++ ++/* ************************************ */ ++ ++static int __init ring_init(void) ++{ ++ printk("Welcome to PF_RING %s\n(C) 2004 L.Deri <deri@ntop.org>\n", ++ RING_VERSION); ++ ++ INIT_LIST_HEAD(&ring_table); ++ ring_cluster_list = NULL; ++ ++ sock_register(&ring_family_ops); ++ ++ set_skb_ring_handler(skb_ring_handler); ++ set_buffer_ring_handler(buffer_ring_handler); ++ ++ if(get_buffer_ring_handler() != buffer_ring_handler) { ++ printk("PF_RING: set_buffer_ring_handler FAILED\n"); ++ ++ set_skb_ring_handler(NULL); ++ set_buffer_ring_handler(NULL); ++ sock_unregister(PF_RING); ++ return -1; ++ } else { ++ printk("PF_RING: bucket length %d bytes\n", bucket_len); ++ printk("PF_RING: ring slots %d\n", num_slots); ++ printk("PF_RING: sample rate %d [1=no sampling]\n", sample_rate); ++ printk("PF_RING: capture TX %s\n", ++ enable_tx_capture ? "Yes [RX+TX]" : "No [RX only]"); ++ printk("PF_RING: transparent mode %s\n", ++ transparent_mode ? "Yes" : "No"); ++ ++ printk("PF_RING initialized correctly.\n"); ++ return 0; ++ } ++} ++ ++module_init(ring_init); ++module_exit(ring_exit); ++MODULE_LICENSE("GPL"); ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) ++MODULE_ALIAS_NETPROTO(PF_RING); ++#endif |