diff -urN libpcap.old/pcap-int.h libpcap.dev/pcap-int.h --- libpcap.old/pcap-int.h 2003-12-15 02:42:24.000000000 +0100 +++ libpcap.dev/pcap-int.h 2005-10-22 23:20:12.220060500 +0200 @@ -30,7 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#) $Header: /tcpdump/master/libpcap/pcap-int.h,v 1.55.2.4 2003/12/15 01:42:24 guy Exp $ (LBL) + * @(#) $Header: /export/home/ntop/PF_RING/userland/libpcap-0.8.1-ring/pcap-int.h,v 1.2 2004/11/25 09:58:00 deri Exp $ (LBL) */ #ifndef pcap_int_h @@ -46,6 +46,8 @@ #include #endif /* WIN32 */ +#define RING /* L.Deri */ + /* * Savefile */ @@ -93,6 +95,57 @@ #endif }; +/* **************************** */ + +#ifdef RING + +#include +#include +#include +#include + +#define PAGE_SIZE 4096 + +#define HAVE_PCAP +#include +#endif + +#ifdef RING + +#define E1000_RXD_STAT_DD 0x01 /* Descriptor Done */ + +struct e1000_rx_desc { + u_int64_t buffer_addr; /* Address of the descriptor's data buffer */ + u_int16_t length; /* Length of data DMAed into data buffer */ + u_int16_t csum; /* Packet checksum */ + u_int8_t status; /* Descriptor status */ + u_int8_t errors; /* Descriptor Errors */ + u_int16_t special; +}; + +/* Transmit Descriptor */ +struct e1000_tx_desc { + u_int64_t buffer_addr; /* Address of the descriptor's data buffer */ + union { + u_int32_t data; + struct { + u_int16_t length; /* Data buffer length */ + u_int8_t cso; /* Checksum offset */ + u_int8_t cmd; /* Descriptor control */ + } flags; + } lower; + union { + u_int32_t data; + struct { + u_int8_t status; /* Descriptor status */ + u_int8_t css; /* Checksum start */ + u_int16_t special; + } fields; + } upper; +}; + +#endif + struct pcap { #ifdef WIN32 ADAPTER *adapter; @@ -121,6 +174,14 @@ u_char *bp; int cc; +#ifdef RING + /* PF_RING */ + char *ring_buffer, *ring_slots; + int ring_fd; + FlowSlotInfo *slots_info; + u_int page_id, slot_id, pkts_per_page; + u_int poll_sleep; +#endif /* * Place holder for pcap_next(). 
*/ diff -urN libpcap.old/pcap-linux.c libpcap.dev/pcap-linux.c --- libpcap.old/pcap-linux.c 2003-11-21 11:20:46.000000000 +0100 +++ libpcap.dev/pcap-linux.c 2005-10-22 23:43:59.726120250 +0200 @@ -27,7 +27,7 @@ #ifndef lint static const char rcsid[] _U_ = - "@(#) $Header: /tcpdump/master/libpcap/pcap-linux.c,v 1.98.2.4 2003/11/21 10:20:46 guy Exp $ (LBL)"; + "@(#) $Header: /export/home/ntop/PF_RING/userland/libpcap-0.8.1-ring/pcap-linux.c,v 1.2 2004/11/25 09:58:00 deri Exp $ (LBL)"; #endif /* @@ -83,7 +83,7 @@ #ifdef HAVE_DAG_API #include "pcap-dag.h" #endif /* HAVE_DAG_API */ - + #include #include #include @@ -217,6 +217,83 @@ = { 1, &total_insn }; #endif +#define RING /* L.Deri */ +#define SAFE_RING_MODE /* + Copy the bucket in order to avoid kernel + crash if the application faults + */ + +#ifdef RING +unsigned char *write_register; +static struct pcap_stat ringStats; +u_long numPollCalls = 0, numReadCalls = 0; + +#define POLL_SLEEP_STEP 10 /* ns = 0.1 ms */ +#define POLL_SLEEP_MIN POLL_SLEEP_STEP +#define POLL_SLEEP_MAX 1000 /* ns */ +#define POLL_QUEUE_MIN_LEN 500 /* # packets */ + +#ifdef SAFE_RING_MODE +static char staticBucket[2048]; +#endif + + +/* ******************************* */ +/* Issues SO_ADD_TO_CLUSTER with clusterId on the handle's ring socket; returns setsockopt()'s result, or -1 when the handle has no ring socket. */ +int pcap_set_cluster(pcap_t *handle, u_int clusterId) { + return((handle->ring_fd > 0) ? setsockopt(handle->ring_fd, 0, SO_ADD_TO_CLUSTER, + &clusterId, sizeof(clusterId)): -1); +} + +/* ******************************* */ +/* Issues SO_REMOVE_FROM_CLUSTER on the handle's ring socket; returns setsockopt()'s result, or -1 when the handle has no ring socket. */ +int pcap_remove_from_cluster(pcap_t *handle) { + return((handle->ring_fd > 0) ? + setsockopt(handle->ring_fd, 0, SO_REMOVE_FROM_CLUSTER, NULL, 0) : -1); +} + +/* ******************************* */ +/* Passes the reflectorDevice name to the ring socket via SO_SET_REFLECTOR; returns setsockopt()'s result, or -1 when there is no ring socket or the name is NULL. */ +int pcap_set_reflector(pcap_t *handle, char *reflectorDevice) { + return((handle->ring_fd > 0 && reflectorDevice != NULL) ? 
+ setsockopt(handle->ring_fd, 0, SO_SET_REFLECTOR, + reflectorDevice /* the name bytes, not the address of the pointer */, strlen(reflectorDevice)) : -1); +} + +/* ******************************* */ +/* Sets (set_promisc != 0) or clears IFF_PROMISC on device through a temporary PF_PACKET socket. Returns 0 on success, -3 for a NULL device, -2 if SIOCGIFFLAGS fails, -1 if socket() or SIOCSIFFLAGS fails. */ +static int set_if_promisc(const char *device, int set_promisc) { + int sock_fd; + struct ifreq ifr; + + if(device == NULL) return(-3); + + sock_fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if(sock_fd < 0) return(-1); /* 0 is a valid descriptor */ + + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name) - 1); /* keep the NUL left by memset */ + if(ioctl(sock_fd, SIOCGIFFLAGS, &ifr) == -1) { + close(sock_fd); + return(-2); + } + + if(set_promisc) { + if((ifr.ifr_flags & IFF_PROMISC) == 0) ifr.ifr_flags |= IFF_PROMISC; + } else { + /* Remove promisc */ + if((ifr.ifr_flags & IFF_PROMISC) != 0) ifr.ifr_flags &= ~IFF_PROMISC; + } + + if(ioctl(sock_fd, SIOCSIFFLAGS, &ifr) == -1) { + close(sock_fd); return(-1); } /* do not leak the socket on failure */ + + close(sock_fd); + return(0); +} + +#endif + /* * Get a handle for a live capture from the given device. You can * pass NULL as device to get all packages (without link level @@ -258,6 +335,138 @@ handle->snapshot = snaplen; handle->md.timeout = to_ms; +#ifdef RING + handle->ring_fd = handle->fd = socket(PF_RING, SOCK_RAW, htons(ETH_P_ALL)); + + printf("Open RING [fd=%d]\n", handle->ring_fd); + + if(handle->ring_fd > 0) { + struct sockaddr sa; + int rc; + u_int memSlotsLen; + + err = 0; + sa.sa_family = PF_RING; + snprintf(sa.sa_data, sizeof(sa.sa_data), "%s", device); + rc = bind(handle->ring_fd, (struct sockaddr *)&sa, sizeof(sa)); + + if(rc == 0) { + + + handle->md.device = strdup(device); + handle->ring_buffer = (char *)mmap(NULL, PAGE_SIZE, + PROT_READ|PROT_WRITE, + MAP_SHARED, + handle->ring_fd, 0); + + if(handle->ring_buffer == MAP_FAILED) { + sprintf(ebuf, "mmap() failed"); + return (NULL); + } + + handle->slots_info = (FlowSlotInfo *)handle->ring_buffer; + if(handle->slots_info->version != RING_FLOWSLOT_VERSION) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, "Wrong RING version: " + "kernel is %i, libpcap was compiled with %i\n", + handle->slots_info->version, 
+ RING_FLOWSLOT_VERSION); + return (NULL); + } + memSlotsLen = handle->slots_info->tot_mem; + munmap(handle->ring_buffer, PAGE_SIZE); + + handle->ring_buffer = (char *)mmap(NULL, memSlotsLen, + PROT_READ|PROT_WRITE, + MAP_SHARED, handle->ring_fd, 0); + + if(handle->ring_buffer == MAP_FAILED) { + sprintf(ebuf, "mmap() failed"); + return (NULL); + } + + handle->slots_info = (FlowSlotInfo *)handle->ring_buffer; + handle->ring_slots = (char *)(handle->ring_buffer+sizeof(FlowSlotInfo)); + + /* Safety check */ + if(handle->slots_info->remove_idx >= handle->slots_info->tot_slots) + handle->slots_info->remove_idx = 0; + + handle->page_id = PAGE_SIZE, handle->slot_id = 0, + handle->pkts_per_page = 0; + + if(0) { + int i; + + for(i=0; i < handle->slots_info->tot_slots; i++) { + unsigned long idx = i*handle->slots_info->slot_len; + FlowSlot *slot = (FlowSlot*)&handle->ring_slots[idx]; + + printf("RING: Setting RING_MAGIC_VALUE into slot %d [displacement=%lu]\n", i, idx); + slot->magic = RING_MAGIC_VALUE; slot->slot_state = 0; + printf("RING: slot[%d]: magic=%d, slot_state=%d\n", + i, slot->magic, slot->slot_state); + } + } + + + /* Set defaults */ + handle->linktype = DLT_EN10MB; + handle->offset = 2; + + printf("RING (%s): tot_slots=%d/slot_len=%d/" + "insertIdx=%d/remove_idx=%d/dropped=%d\n", + device, + handle->slots_info->tot_slots, + handle->slots_info->slot_len, + handle->slots_info->insert_idx, + handle->slots_info->remove_idx, + handle->slots_info->tot_lost); + + ringStats.ps_recv = handle->slots_info->tot_read; + ringStats.ps_drop = handle->slots_info->tot_lost; + + if(promisc) { + struct ifreq ifr; + + err = 0; + memset(&ifr, 0, sizeof(ifr)); + strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name) - 1); /* keep the NUL left by memset */ + if (ioctl(handle->fd, SIOCGIFFLAGS, &ifr) == -1) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "ioctl: %s", pcap_strerror(errno)); + err = 1; + } + + if(err == 0) { + if ((ifr.ifr_flags & IFF_PROMISC) == 0) { + /* + * Promiscuous mode isn't currently on, + * so turn it on, and remember that + * 
we should turn it off when the + * pcap_t is closed. + */ + + ifr.ifr_flags |= IFF_PROMISC; + if (ioctl(handle->fd, SIOCSIFFLAGS, &ifr) == -1) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "ioctl: %s", pcap_strerror(errno)); + err = 1; + } + } + + if(err == 0) + handle->md.clear_promisc = 1; + } + } + + if(err == 0) + goto open_open_live_final; + /* Promisc setup failed: drop the mapping so the ring_buffer != NULL read path is not taken on a closed ring */ munmap(handle->ring_buffer, memSlotsLen); handle->ring_buffer = NULL; handle->ring_slots = NULL; handle->slots_info = NULL; } + + /* Don't put 'else' above... */ + close(handle->ring_fd); handle->ring_fd = 0; /* 0 is what every ring_fd test in this file reads as "no ring" */ + /* Continue without ring support */ + } +#endif /* * NULL and "any" are special devices which give us the hint to * monitor all devices. @@ -397,6 +606,9 @@ return NULL; } +#ifdef RING + open_open_live_final: +#endif /* * "handle->fd" is a socket, so "select()" and "poll()" * should work on it. @@ -449,6 +661,120 @@ int packet_len, caplen; struct pcap_pkthdr pcap_header; +#ifdef RING + if(handle->ring_buffer != NULL) { + u_int idx, numRuns = 0, ptrAddr = 0; /* ptrAddr is only printed by the debug branches */ + FlowSlot *slot; + + slot = (FlowSlot*)&handle->ring_slots[handle->slots_info->remove_idx*handle->slots_info->slot_len]; + + while(1) { + u_int32_t queuedPkts; + + if(handle->slots_info->tot_insert >= handle->slots_info->tot_read) + queuedPkts = handle->slots_info->tot_insert - handle->slots_info->tot_read; + else + queuedPkts = handle->slots_info->tot_slots + handle->slots_info->tot_insert - handle->slots_info->tot_read; + + if(queuedPkts && (slot->slot_state == 1)) { + char *bucket = &slot->bucket; + +#ifdef RING_MAGIC + if(slot->magic != RING_MAGIC_VALUE) { + printf("==>> Bad Magic [remove_idx=%u][insert_idx=%u][ptrAddr=%u]\n", + handle->slots_info->remove_idx, + handle->slots_info->insert_idx, + ptrAddr); + slot->magic = RING_MAGIC_VALUE; + } +#endif + + + handle->md.stat.ps_recv++; + +#ifdef SAFE_RING_MODE + { + struct pcap_pkthdr *hdr = (struct pcap_pkthdr*)bucket; + int bktLen = hdr->caplen; + + if(bktLen > sizeof(staticBucket)) + bktLen = sizeof(staticBucket); + + memcpy(staticBucket, &bucket[sizeof(struct pcap_pkthdr)], bktLen); + +#ifdef RING_DEBUG + printf("==>> 
[remove_idx=%u][insert_idx=%u][ptrAddr=%u]\n", + handle->slots_info->remove_idx, + handle->slots_info->insert_idx, + ptrAddr); +#endif + + callback(userdata, hdr, staticBucket); + } +#else + callback(userdata, + (const struct pcap_pkthdr*)bucket, + (const u_char*)&bucket[sizeof(struct pcap_pkthdr)]); +#endif + + if(handle->slots_info->remove_idx >= (handle->slots_info->tot_slots-1)) { + handle->slots_info->remove_idx = 0; + handle->page_id = PAGE_SIZE, handle->slot_id = 0, handle->pkts_per_page = 0; + } else { + handle->slots_info->remove_idx++; + handle->pkts_per_page++, handle->slot_id += handle->slots_info->slot_len; + } + + handle->slots_info->tot_read++; + slot->slot_state = 0; + + return(1); + } else { + struct pollfd pfd; + int rc; + + /* Sleep when nothing is happening */ + pfd.fd = handle->ring_fd; + pfd.events = POLLIN|POLLERR; + pfd.revents = 0; + +#ifdef RING_DEBUG + printf("==>> poll [remove_idx=%u][insert_idx=%u][loss=%d][queuedPkts=%u]" + "[slot_state=%d][tot_insert=%u][tot_read=%u]\n", + handle->slots_info->remove_idx, + handle->slots_info->insert_idx, + handle->slots_info->tot_lost, + queuedPkts, slot->slot_state, + handle->slots_info->tot_insert, + handle->slots_info->tot_read); + #endif + +#ifdef RING_DEBUG + printf("==>> poll @ [remove_idx=%u][slot_id=%u]\n", handle->slots_info->remove_idx, handle->slot_id); +#endif + errno = 0; + rc = poll(&pfd, 1, -1); +#ifdef RING_DEBUG + printf("==>> poll returned %d [%s][errno=%d][break_loop=%d]\n", + rc, strerror(errno), errno, handle->break_loop); +#endif + numPollCalls++; + + if(rc == -1) { + if(errno == EINTR) { + if(handle->break_loop) { + handle->break_loop = 0; + return(-2); + } else + return(0); + } else + return(-1); + } + } + } /* while() */ + } +#endif + #ifdef HAVE_PF_PACKET_SOCKETS /* * If this is a cooked device, leave extra room for a @@ -688,6 +1014,22 @@ socklen_t len = sizeof (struct tpacket_stats); #endif +#ifdef RING + if(handle->ring_fd > 0) { + stats->ps_recv = 
handle->slots_info->tot_read-ringStats.ps_recv; + stats->ps_drop = handle->slots_info->tot_lost-ringStats.ps_drop; + + printf("RING: numPollCalls=%lu [%.1f packets/call]\n", + numPollCalls, (float)stats->ps_recv/(float)numPollCalls); + printf("RING: [tot_pkts=%u][tot_read=%u][tot_lost=%u]\n", + handle->slots_info->tot_pkts, + handle->slots_info->tot_read, + handle->slots_info->tot_lost); + + return(0); + } +#endif + #ifdef HAVE_TPACKET_STATS /* * Try to get the packet counts from the kernel. @@ -879,6 +1221,11 @@ } } + +#ifdef RING + if(handle->ring_fd <= 0) can_filter_in_kernel = 0; +#endif + if (can_filter_in_kernel) { if ((err = set_kernel_filter(handle, &fcode)) == 0) { @@ -1348,7 +1695,7 @@ memset(&mr, 0, sizeof(mr)); mr.mr_ifindex = device_id; mr.mr_type = PACKET_MR_PROMISC; - if (setsockopt(sock_fd, SOL_PACKET, + if (setsockopt(sock_fd, 0 /* SOL_PACKET */, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr)) == -1) { snprintf(ebuf, PCAP_ERRBUF_SIZE, @@ -1425,10 +1772,11 @@ /* Any pending errors, e.g., network is down? */ - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, - "getsockopt: %s", pcap_strerror(errno)); - return -2; + if ((getsockopt(fd, PF_RING, SO_ERROR, &err, &errlen) == -1) + && (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1)) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "getsockopt: %s", pcap_strerror(errno)); + return -2; } if (err > 0) { @@ -1482,6 +1830,13 @@ struct pcap *p, *prevp; struct ifreq ifr; +#ifdef RING + if(handle->ring_buffer != NULL) { + munmap(handle->ring_buffer, handle->slots_info->tot_mem); + handle->ring_buffer = NULL; + } +#endif + if (handle->md.clear_promisc) { /* * We put the interface into promiscuous mode; take @@ -1698,11 +2053,11 @@ } /* Any pending errors, e.g., network is down? 
*/ - - if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1) { - snprintf(ebuf, PCAP_ERRBUF_SIZE, - "getsockopt: %s", pcap_strerror(errno)); - return -1; + if((getsockopt(fd, PF_RING, SO_ERROR, &err, &errlen) == -1) + && (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &errlen) == -1)) { + snprintf(ebuf, PCAP_ERRBUF_SIZE, + "getsockopt: %s", pcap_strerror(errno)); + return -1; } if (err > 0) { @@ -1924,8 +2279,11 @@ * the filtering done in userland even if it could have been * done in the kernel. */ - if (setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER, - &total_fcode, sizeof(total_fcode)) == 0) { + printf("pcap[setsockopt(%d)]\n", 0); + if (setsockopt(handle->fd, 0 /* SOL_SOCKET */, + SO_ATTACH_FILTER, + &total_fcode, + sizeof(total_fcode)) == 0) { char drain[1]; /* @@ -1933,6 +2291,9 @@ */ total_filter_on = 1; +#ifdef RING + if(!handle->ring_fd) { +#endif /* * Save the socket's current mode, and put it in * non-blocking mode; we drain it by reading packets @@ -1955,12 +2316,15 @@ return -2; } } - } +#ifdef RING + } +#endif +} /* * Now attach the new filter. */ - ret = setsockopt(handle->fd, SOL_SOCKET, SO_ATTACH_FILTER, + ret = setsockopt(handle->fd, 0 /* SOL_SOCKET */, SO_ATTACH_FILTER, fcode, sizeof(*fcode)); if (ret == -1 && total_filter_on) { /* @@ -1993,7 +2357,8 @@ /* setsockopt() barfs unless it get a dummy parameter */ int dummy; - return setsockopt(handle->fd, SOL_SOCKET, SO_DETACH_FILTER, - &dummy, sizeof(dummy)); + return setsockopt(handle->fd, handle->ring_fd > 0 ? PF_RING : SOL_SOCKET, + SO_DETACH_FILTER, + &dummy, sizeof(dummy)); } #endif