--- /dev/null
+/*
+ * IP Virtual Server
+ * data structure and functionality definitions
+ */
+
+#ifndef _IP_VS_H
+#define _IP_VS_H
+
+#include <asm/types.h> /* For __uXX types */
+
+#define IP_VS_VERSION_CODE 0x010107
+#define NVERSION(version) \
+ (version >> 16) & 0xFF, \
+ (version >> 8) & 0xFF, \
+ version & 0xFF
+
+/*
+ * Virtual Service Flags
+ */
+#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
+#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
+
+/*
+ * Destination Server Flags
+ */
+#define IP_VS_DEST_F_AVAILABLE 0x0001 /* server is available */
+#define IP_VS_DEST_F_OVERLOAD 0x0002 /* server is overloaded */
+
+/*
+ * IPVS sync daemon states
+ */
+#define IP_VS_STATE_NONE 0x0000 /* daemon is stopped */
+#define IP_VS_STATE_MASTER 0x0001 /* started as master */
+#define IP_VS_STATE_BACKUP 0x0002 /* started as backup */
+
+/*
+ * IPVS socket options
+ */
+#define IP_VS_BASE_CTL (64+1024+64) /* base */
+
+#define IP_VS_SO_SET_NONE IP_VS_BASE_CTL /* just peek */
+#define IP_VS_SO_SET_INSERT (IP_VS_BASE_CTL+1)
+#define IP_VS_SO_SET_ADD (IP_VS_BASE_CTL+2)
+#define IP_VS_SO_SET_EDIT (IP_VS_BASE_CTL+3)
+#define IP_VS_SO_SET_DEL (IP_VS_BASE_CTL+4)
+#define IP_VS_SO_SET_FLUSH (IP_VS_BASE_CTL+5)
+#define IP_VS_SO_SET_LIST (IP_VS_BASE_CTL+6)
+#define IP_VS_SO_SET_ADDDEST (IP_VS_BASE_CTL+7)
+#define IP_VS_SO_SET_DELDEST (IP_VS_BASE_CTL+8)
+#define IP_VS_SO_SET_EDITDEST (IP_VS_BASE_CTL+9)
+#define IP_VS_SO_SET_TIMEOUT (IP_VS_BASE_CTL+10)
+#define IP_VS_SO_SET_STARTDAEMON (IP_VS_BASE_CTL+11)
+#define IP_VS_SO_SET_STOPDAEMON (IP_VS_BASE_CTL+12)
+#define IP_VS_SO_SET_RESTORE (IP_VS_BASE_CTL+13)
+#define IP_VS_SO_SET_SAVE (IP_VS_BASE_CTL+14)
+#define IP_VS_SO_SET_ZERO (IP_VS_BASE_CTL+15)
+#define IP_VS_SO_SET_MAX IP_VS_SO_SET_ZERO
+
+#define IP_VS_SO_GET_VERSION IP_VS_BASE_CTL
+#define IP_VS_SO_GET_INFO (IP_VS_BASE_CTL+1)
+#define IP_VS_SO_GET_SERVICES (IP_VS_BASE_CTL+2)
+#define IP_VS_SO_GET_SERVICE (IP_VS_BASE_CTL+3)
+#define IP_VS_SO_GET_DESTS (IP_VS_BASE_CTL+4)
+#define IP_VS_SO_GET_DEST (IP_VS_BASE_CTL+5) /* not used now */
+#define IP_VS_SO_GET_TIMEOUT (IP_VS_BASE_CTL+6)
+#define IP_VS_SO_GET_DAEMON (IP_VS_BASE_CTL+7)
+#define IP_VS_SO_GET_MAX IP_VS_SO_GET_DAEMON
+
+
+/*
+ * IPVS Connection Flags
+ */
+#define IP_VS_CONN_F_FWD_MASK 0x0007 /* mask for the fwd methods */
+#define IP_VS_CONN_F_MASQ 0x0000 /* masquerading/NAT */
+#define IP_VS_CONN_F_LOCALNODE 0x0001 /* local node */
+#define IP_VS_CONN_F_TUNNEL 0x0002 /* tunneling */
+#define IP_VS_CONN_F_DROUTE 0x0003 /* direct routing */
+#define IP_VS_CONN_F_BYPASS 0x0004 /* cache bypass */
+#define IP_VS_CONN_F_SYNC 0x0020 /* entry created by sync */
+#define IP_VS_CONN_F_HASHED 0x0040 /* hashed entry */
+#define IP_VS_CONN_F_NOOUTPUT 0x0080 /* no output packets */
+#define IP_VS_CONN_F_INACTIVE 0x0100 /* not established */
+#define IP_VS_CONN_F_OUT_SEQ 0x0200 /* must do output seq adjust */
+#define IP_VS_CONN_F_IN_SEQ 0x0400 /* must do input seq adjust */
+#define IP_VS_CONN_F_SEQ_MASK 0x0600 /* in/out sequence mask */
+#define IP_VS_CONN_F_NO_CPORT 0x0800 /* no client port set yet */
+
+/* Move it to better place one day, for now keep it unique */
+#define NFC_IPVS_PROPERTY 0x10000
+
+#define IP_VS_SCHEDNAME_MAXLEN 16
+#define IP_VS_IFNAME_MAXLEN 16
+
+
+/*
+ * The struct ip_vs_service_user and struct ip_vs_dest_user are
+ * used to set IPVS rules through setsockopt.
+ */
+struct ip_vs_service_user {
+ /* virtual service addresses */
+ u_int16_t protocol;
+ u_int32_t addr; /* virtual ip address */
+ u_int16_t port;
+ u_int32_t fwmark; /* firwall mark of service */
+
+ /* virtual service options */
+ char sched_name[IP_VS_SCHEDNAME_MAXLEN];
+ unsigned flags; /* virtual service flags */
+ unsigned timeout; /* persistent timeout in sec */
+ u_int32_t netmask; /* persistent netmask */
+};
+
+
+struct ip_vs_dest_user {
+ /* destination server address */
+ u_int32_t addr;
+ u_int16_t port;
+
+ /* real server options */
+ unsigned conn_flags; /* connection flags */
+ int weight; /* destination weight */
+
+ /* thresholds for active connections */
+ u_int32_t u_threshold; /* upper threshold */
+ u_int32_t l_threshold; /* lower threshold */
+};
+
+
+/*
+ * IPVS statistics object (for user space)
+ */
+struct ip_vs_stats_user
+{
+ __u32 conns; /* connections scheduled */
+ __u32 inpkts; /* incoming packets */
+ __u32 outpkts; /* outgoing packets */
+ __u64 inbytes; /* incoming bytes */
+ __u64 outbytes; /* outgoing bytes */
+
+ __u32 cps; /* current connection rate */
+ __u32 inpps; /* current in packet rate */
+ __u32 outpps; /* current out packet rate */
+ __u32 inbps; /* current in byte rate */
+ __u32 outbps; /* current out byte rate */
+};
+
+
+/* The argument to IP_VS_SO_GET_INFO */
+struct ip_vs_getinfo {
+ /* version number */
+ unsigned int version;
+
+ /* size of connection hash table */
+ unsigned int size;
+
+ /* number of virtual services */
+ unsigned int num_services;
+};
+
+
+/* The argument to IP_VS_SO_GET_SERVICE */
+struct ip_vs_service_entry {
+ /* which service: user fills in these */
+ u_int16_t protocol;
+ u_int32_t addr; /* virtual address */
+ u_int16_t port;
+ u_int32_t fwmark; /* firwall mark of service */
+
+ /* service options */
+ char sched_name[IP_VS_SCHEDNAME_MAXLEN];
+ unsigned flags; /* virtual service flags */
+ unsigned timeout; /* persistent timeout */
+ u_int32_t netmask; /* persistent netmask */
+
+ /* number of real servers */
+ unsigned int num_dests;
+
+ /* statistics */
+ struct ip_vs_stats_user stats;
+};
+
+
+struct ip_vs_dest_entry {
+ u_int32_t addr; /* destination address */
+ u_int16_t port;
+ unsigned conn_flags; /* connection flags */
+ int weight; /* destination weight */
+
+ u_int32_t u_threshold; /* upper threshold */
+ u_int32_t l_threshold; /* lower threshold */
+
+ u_int32_t activeconns; /* active connections */
+ u_int32_t inactconns; /* inactive connections */
+ u_int32_t persistconns; /* persistent connections */
+
+ /* statistics */
+ struct ip_vs_stats_user stats;
+};
+
+
+/* The argument to IP_VS_SO_GET_DESTS */
+struct ip_vs_get_dests {
+ /* which service: user fills in these */
+ u_int16_t protocol;
+ u_int32_t addr; /* virtual address */
+ u_int16_t port;
+ u_int32_t fwmark; /* firwall mark of service */
+
+ /* number of real servers */
+ unsigned int num_dests;
+
+ /* the real servers */
+ struct ip_vs_dest_entry entrytable[0];
+};
+
+
+/* The argument to IP_VS_SO_GET_SERVICES */
+struct ip_vs_get_services {
+ /* number of virtual services */
+ unsigned int num_services;
+
+ /* service table */
+ struct ip_vs_service_entry entrytable[0];
+};
+
+
+/* The argument to IP_VS_SO_GET_TIMEOUT */
+struct ip_vs_timeout_user {
+ int tcp_timeout;
+ int tcp_fin_timeout;
+ int udp_timeout;
+};
+
+
+/* The argument to IP_VS_SO_GET_DAEMON */
+struct ip_vs_daemon_user {
+ /* sync daemon state (master/backup) */
+ int state;
+
+ /* multicast interface name */
+ char mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+ /* SyncID we belong to */
+ int syncid;
+};
+
+
+#ifdef __KERNEL__
+
+#include <linux/config.h>
+#include <linux/list.h> /* for struct list_head */
+#include <linux/spinlock.h> /* for struct rwlock_t */
+#include <linux/skbuff.h> /* for struct sk_buff */
+#include <linux/ip.h> /* for struct iphdr */
+#include <asm/atomic.h> /* for struct atomic_t */
+#include <linux/netdevice.h> /* for struct neighbour */
+#include <net/dst.h> /* for struct dst_entry */
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/compiler.h>
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+extern int ip_vs_get_debug_level(void);
+#define IP_VS_DBG(level, msg...) \
+ do { \
+ if (level <= ip_vs_get_debug_level()) \
+ printk(KERN_DEBUG "IPVS: " msg); \
+ } while (0)
+#define IP_VS_DBG_RL(msg...) \
+ do { \
+ if (net_ratelimit()) \
+ printk(KERN_DEBUG "IPVS: " msg); \
+ } while (0)
+#define IP_VS_DBG_PKT(level, pp, iph, msg) \
+ do { \
+ if (level <= ip_vs_get_debug_level()) \
+ pp->debug_packet(pp, iph, msg); \
+ } while (0)
+#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) \
+ do { \
+ if (level <= ip_vs_get_debug_level() && \
+ net_ratelimit()) \
+ pp->debug_packet(pp, iph, msg); \
+ } while (0)
+#else /* NO DEBUGGING at ALL */
+#define IP_VS_DBG(level, msg...) do {} while (0)
+#define IP_VS_DBG_RL(msg...) do {} while (0)
+#define IP_VS_DBG_PKT(level, pp, iph, msg) do {} while (0)
+#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) do {} while (0)
+#endif
+
+#define IP_VS_BUG() BUG()
+#define IP_VS_ERR(msg...) printk(KERN_ERR "IPVS: " msg)
+#define IP_VS_INFO(msg...) printk(KERN_INFO "IPVS: " msg)
+#define IP_VS_WARNING(msg...) \
+ printk(KERN_WARNING "IPVS: " msg)
+#define IP_VS_ERR_RL(msg...) \
+ do { \
+ if (net_ratelimit()) \
+ printk(KERN_ERR "IPVS: " msg); \
+ } while (0)
+
+#ifdef CONFIG_IP_VS_DEBUG
+#define EnterFunction(level) \
+ do { \
+ if (level <= ip_vs_get_debug_level()) \
+ printk(KERN_DEBUG "Enter: %s, %s line %i\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+ } while (0)
+#define LeaveFunction(level) \
+ do { \
+ if (level <= ip_vs_get_debug_level()) \
+ printk(KERN_DEBUG "Leave: %s, %s line %i\n", \
+ __FUNCTION__, __FILE__, __LINE__); \
+ } while (0)
+#else
+#define EnterFunction(level) do {} while (0)
+#define LeaveFunction(level) do {} while (0)
+#endif
+
+#define IP_VS_WAIT_WHILE(expr) while (expr) { cpu_relax(); }
+
+
+/*
+ * The port number of FTP service (in network order).
+ */
+#define FTPPORT __constant_htons(21)
+#define FTPDATA __constant_htons(20)
+
+/*
+ * IPVS sysctl variables under the /proc/sys/net/ipv4/vs/
+ */
+#define NET_IPV4_VS 21
+
+enum {
+ NET_IPV4_VS_DEBUG_LEVEL=1,
+ NET_IPV4_VS_AMEMTHRESH=2,
+ NET_IPV4_VS_AMDROPRATE=3,
+ NET_IPV4_VS_DROP_ENTRY=4,
+ NET_IPV4_VS_DROP_PACKET=5,
+ NET_IPV4_VS_SECURE_TCP=6,
+ NET_IPV4_VS_TO_ES=7,
+ NET_IPV4_VS_TO_SS=8,
+ NET_IPV4_VS_TO_SR=9,
+ NET_IPV4_VS_TO_FW=10,
+ NET_IPV4_VS_TO_TW=11,
+ NET_IPV4_VS_TO_CL=12,
+ NET_IPV4_VS_TO_CW=13,
+ NET_IPV4_VS_TO_LA=14,
+ NET_IPV4_VS_TO_LI=15,
+ NET_IPV4_VS_TO_SA=16,
+ NET_IPV4_VS_TO_UDP=17,
+ NET_IPV4_VS_TO_ICMP=18,
+ NET_IPV4_VS_LBLC_EXPIRE=19,
+ NET_IPV4_VS_LBLCR_EXPIRE=20,
+ NET_IPV4_VS_CACHE_BYPASS=22,
+ NET_IPV4_VS_EXPIRE_NODEST_CONN=23,
+ NET_IPV4_VS_SYNC_THRESHOLD=24,
+ NET_IPV4_VS_NAT_ICMP_SEND=25,
+ NET_IPV4_VS_LAST
+};
+
+/*
+ * TCP State Values
+ */
+enum {
+ IP_VS_TCP_S_NONE = 0,
+ IP_VS_TCP_S_ESTABLISHED,
+ IP_VS_TCP_S_SYN_SENT,
+ IP_VS_TCP_S_SYN_RECV,
+ IP_VS_TCP_S_FIN_WAIT,
+ IP_VS_TCP_S_TIME_WAIT,
+ IP_VS_TCP_S_CLOSE,
+ IP_VS_TCP_S_CLOSE_WAIT,
+ IP_VS_TCP_S_LAST_ACK,
+ IP_VS_TCP_S_LISTEN,
+ IP_VS_TCP_S_SYNACK,
+ IP_VS_TCP_S_LAST
+};
+
+/*
+ * UDP State Values
+ */
+enum {
+ IP_VS_UDP_S_NORMAL,
+ IP_VS_UDP_S_LAST,
+};
+
+/*
+ * ICMP State Values
+ */
+enum {
+ IP_VS_ICMP_S_NORMAL,
+ IP_VS_ICMP_S_LAST,
+};
+
+/*
+ * Transport protocol header
+ */
+union ip_vs_tphdr {
+ unsigned char *raw;
+ struct udphdr *uh;
+ struct tcphdr *th;
+ struct icmphdr *icmph;
+ __u16 *portp;
+};
+
+
+/*
+ * Slow timer for IPVS connections
+ */
+struct sltimer_list {
+ struct list_head list;
+ unsigned long expires;
+ unsigned long data;
+ void (*function)(unsigned long);
+};
+
+
+/*
+ * Delta sequence info structure
+ * Each ip_vs_conn has 2 (output AND input seq. changes).
+ * Only used in the VS/NAT.
+ */
+struct ip_vs_seq {
+ __u32 init_seq; /* Add delta from this seq */
+ __u32 delta; /* Delta in sequence numbers */
+ __u32 previous_delta; /* Delta in sequence numbers
+ before last resized pkt */
+};
+
+
+/*
+ * IPVS statistics object
+ */
+struct ip_vs_stats
+{
+ __u32 conns; /* connections scheduled */
+ __u32 inpkts; /* incoming packets */
+ __u32 outpkts; /* outgoing packets */
+ __u64 inbytes; /* incoming bytes */
+ __u64 outbytes; /* outgoing bytes */
+
+ __u32 cps; /* current connection rate */
+ __u32 inpps; /* current in packet rate */
+ __u32 outpps; /* current out packet rate */
+ __u32 inbps; /* current in byte rate */
+ __u32 outbps; /* current out byte rate */
+
+ spinlock_t lock; /* spin lock */
+};
+
+struct ip_vs_conn;
+struct ip_vs_app;
+
+#define CONFIG_IP_VS_PROTO_UDP
+#define CONFIG_IP_VS_PROTO_TCP
+#define CONFIG_IP_VS_PROTO_ICMP
+#define CONFIG_IP_VS_PROTO_ESP
+#define CONFIG_IP_VS_PROTO_AH
+
+struct ip_vs_protocol {
+ struct ip_vs_protocol *next;
+ char *name;
+ __u16 protocol;
+ int minhlen;
+ int minhlen_icmp;
+ int dont_defrag;
+ int skip_nonexisting;
+ int slave; /* if controlled by others */
+ atomic_t appcnt; /* counter of proto app incs */
+ int *timeout_table; /* protocol timeout table */
+
+ void (*init)(struct ip_vs_protocol *pp);
+
+ void (*exit)(struct ip_vs_protocol *pp);
+
+ int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp);
+
+ struct ip_vs_conn *
+ (*conn_in_get)(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct iphdr *iph,
+ union ip_vs_tphdr h, int inverse);
+
+ struct ip_vs_conn *
+ (*conn_out_get)(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct iphdr *iph,
+ union ip_vs_tphdr h, int inverse);
+
+ int (*snat_handler)(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size);
+
+ int (*dnat_handler)(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size);
+
+ int (*csum_check)(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct iphdr *iph,
+ union ip_vs_tphdr h, int size);
+
+ const char *(*state_name)(int state);
+
+ int (*state_transition)(struct ip_vs_conn *cp, int direction,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ struct ip_vs_protocol *pp);
+
+ int (*register_app)(struct ip_vs_app *inc);
+
+ void (*unregister_app)(struct ip_vs_app *inc);
+
+ int (*app_conn_bind)(struct ip_vs_conn *cp);
+
+ void (*debug_packet)(struct ip_vs_protocol *pp, struct iphdr *iph,
+ char *msg);
+
+ void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
+
+ int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to);
+};
+
+extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto);
+
+/*
+ * IP_VS structure allocated for each dynamically scheduled connection
+ */
+struct ip_vs_conn {
+ struct list_head c_list; /* hashed list heads */
+
+ /* Protocol, addresses and port numbers */
+ __u32 caddr; /* client address */
+ __u32 vaddr; /* virtual address */
+ __u32 daddr; /* destination address */
+ __u16 cport;
+ __u16 vport;
+ __u16 dport;
+ __u16 protocol; /* Which protocol (TCP/UDP) */
+
+ /* counter and timer */
+ atomic_t refcnt; /* reference count */
+ struct sltimer_list timer; /* Expiration timer */
+ volatile unsigned long timeout; /* timeout */
+
+ /* Flags and state transition */
+ spinlock_t lock; /* lock for state transition */
+ volatile __u16 flags; /* status flags */
+ volatile __u16 state; /* state info */
+
+ /* Control members */
+ struct ip_vs_conn *control; /* Master control connection */
+ atomic_t n_control; /* Number of controlled ones */
+ struct ip_vs_dest *dest; /* real server */
+ atomic_t in_pkts; /* incoming packet counter */
+
+ /* packet transmitter for different forwarding methods */
+ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp);
+
+ /* Note: we can group the following members into a structure,
+ in order to save more space, and the following members are
+ only used in VS/NAT anyway */
+ struct ip_vs_app *app; /* bound ip_vs_app object */
+ void *app_data; /* Application private data */
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+
+/*
+ * The information about the virtual service offered to the net
+ * and the forwarding entries
+ */
+struct ip_vs_service {
+ struct list_head s_list; /* for normal service table */
+ struct list_head f_list; /* for fwmark-based service table */
+ atomic_t refcnt; /* reference counter */
+ atomic_t usecnt; /* use counter */
+
+ __u16 protocol; /* which protocol (TCP/UDP) */
+ __u32 addr; /* IP address for virtual service */
+ __u16 port; /* port number for the service */
+ __u32 fwmark; /* firewall mark of the service */
+ unsigned flags; /* service status flags */
+ unsigned timeout; /* persistent timeout in ticks */
+ __u32 netmask; /* grouping granularity */
+
+ struct list_head destinations; /* real server d-linked list */
+ __u32 num_dests; /* number of servers */
+ struct ip_vs_stats stats; /* statistics for the service */
+ struct ip_vs_app *inc; /* bind conns to this app inc */
+
+ /* for scheduling */
+ struct ip_vs_scheduler *scheduler; /* bound scheduler object */
+ rwlock_t sched_lock; /* lock sched_data */
+ void *sched_data; /* scheduler application data */
+};
+
+
+/*
+ * The real server destination forwarding entry
+ * with ip address, port number, and so on.
+ */
+struct ip_vs_dest {
+ struct list_head n_list; /* for the dests in the service */
+ struct list_head d_list; /* for table with all the dests */
+
+ __u32 addr; /* IP address of the server */
+ __u16 port; /* port number of the server */
+ volatile unsigned flags; /* dest status flags */
+ atomic_t conn_flags; /* flags to copy to conn */
+ atomic_t weight; /* server weight */
+
+ atomic_t refcnt; /* reference counter */
+ struct ip_vs_stats stats; /* statistics */
+
+ /* connection counters and thresholds */
+ atomic_t activeconns; /* active connections */
+ atomic_t inactconns; /* inactive connections */
+ atomic_t persistconns; /* persistent connections */
+ __u32 u_threshold; /* upper threshold */
+ __u32 l_threshold; /* lower threshold */
+
+ /* for destination cache */
+ spinlock_t dst_lock; /* lock of dst_cache */
+ struct dst_entry *dst_cache; /* destination cache entry */
+ u32 dst_rtos; /* RT_TOS(tos) for dst */
+
+ /* for virtual service */
+ struct ip_vs_service *svc; /* service it belongs to */
+ __u16 protocol; /* which protocol (TCP/UDP) */
+ __u32 vaddr; /* virtual IP address */
+ __u16 vport; /* virtual port number */
+ __u32 vfwmark; /* firewall mark of service */
+};
+
+
+/*
+ * The scheduler object
+ */
+struct ip_vs_scheduler {
+ struct list_head n_list; /* d-linked list head */
+ char *name; /* scheduler name */
+ atomic_t refcnt; /* reference counter */
+ struct module *module; /* THIS_MODULE/NULL */
+
+ /* scheduler initializing service */
+ int (*init_service)(struct ip_vs_service *svc);
+ /* scheduling service finish */
+ int (*done_service)(struct ip_vs_service *svc);
+ /* scheduler updating service */
+ int (*update_service)(struct ip_vs_service *svc);
+
+ /* selecting a server from the given service */
+ struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
+ struct iphdr *iph);
+};
+
+
+/*
+ * The application module object (a.k.a. app incarnation)
+ */
+struct ip_vs_app
+{
+ struct list_head a_list; /* member in app list */
+ int type; /* IP_VS_APP_TYPE_xxx */
+ char *name; /* application module name */
+ __u16 protocol;
+ struct module *module; /* THIS_MODULE/NULL */
+ struct list_head incs_list; /* list of incarnations */
+
+ /* members for application incarnations */
+ struct list_head p_list; /* member in proto app list */
+ struct ip_vs_app *app; /* its real application */
+ __u16 port; /* port number in net order */
+ atomic_t usecnt; /* usage counter */
+
+ /* output hook */
+ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
+ struct sk_buff *);
+
+ /* input hook */
+ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
+ struct sk_buff *);
+
+ /* ip_vs_app initializer */
+ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
+
+ /* ip_vs_app finish */
+ int (*done_conn)(struct ip_vs_app *, struct ip_vs_conn *);
+
+
+ /* not used now */
+ int (*bind_conn)(struct ip_vs_app *, struct ip_vs_conn *,
+ struct ip_vs_protocol *);
+
+ void (*unbind_conn)(struct ip_vs_app *, struct ip_vs_conn *);
+
+ int * timeout_table;
+ int * timeouts;
+ int timeouts_size;
+
+ int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp);
+
+ struct ip_vs_conn *
+ (*conn_in_get)(struct sk_buff *skb, struct ip_vs_app *app,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse);
+
+ struct ip_vs_conn *
+ (*conn_out_get)(struct sk_buff *skb, struct ip_vs_app *app,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse);
+
+ int (*state_transition)(struct ip_vs_conn *cp, int direction,
+ struct iphdr *iph,
+ union ip_vs_tphdr h, struct ip_vs_app *app);
+
+ void (*timeout_change)(struct ip_vs_app *app, int flags);
+};
+
+
+/*
+ * IPVS core functions
+ * (from ip_vs_core.c)
+ */
+extern const char *ip_vs_proto_name(unsigned proto);
+extern unsigned int check_for_ip_vs_out(struct sk_buff **skb_p,
+ int (*okfn)(struct sk_buff *));
+extern void ip_vs_init_hash_table(struct list_head *table, int rows);
+#define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table(t, sizeof(t)/sizeof(t[0]))
+
+#define IP_VS_APP_TYPE_UNSPEC 0
+#define IP_VS_APP_TYPE_FTP 1
+
+/*
+ * ip_vs_conn handling functions
+ * (from ip_vs_conn.c)
+ */
+
+/*
+ * IPVS connection entry hash table
+ */
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+/* make sure that IP_VS_CONN_TAB_BITS is located in [8, 20] */
+#if CONFIG_IP_VS_TAB_BITS < 8
+#define IP_VS_CONN_TAB_BITS 8
+#endif
+#if CONFIG_IP_VS_TAB_BITS > 20
+#define IP_VS_CONN_TAB_BITS 20
+#endif
+#if 8 <= CONFIG_IP_VS_TAB_BITS && CONFIG_IP_VS_TAB_BITS <= 20
+#define IP_VS_CONN_TAB_BITS CONFIG_IP_VS_TAB_BITS
+#endif
+#define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS)
+#define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1)
+
+enum {
+ IP_VS_DIR_INPUT = 0,
+ IP_VS_DIR_OUTPUT,
+ IP_VS_DIR_INPUT_ONLY,
+ IP_VS_DIR_LAST,
+};
+
+extern struct ip_vs_conn *ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
+extern struct ip_vs_conn *ip_vs_conn_out_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port);
+
+/* put back the conn without restarting its timer */
+static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ atomic_dec(&cp->refcnt);
+}
+extern void ip_vs_conn_put(struct ip_vs_conn *cp);
+extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport);
+
+extern struct ip_vs_conn *
+ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
+ __u32 daddr, __u16 dport, unsigned flags,
+ struct ip_vs_dest *dest);
+extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
+
+extern const char * ip_vs_state_name(__u16 proto, int state);
+
+extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp);
+extern int ip_vs_check_template(struct ip_vs_conn *ct);
+extern void ip_vs_secure_tcp_set(int on);
+extern void ip_vs_random_dropentry(void);
+extern int ip_vs_conn_init(void);
+extern void ip_vs_conn_cleanup(void);
+
+static inline void ip_vs_control_del(struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn *ctl_cp = cp->control;
+ if (!ctl_cp) {
+ IP_VS_ERR("request control DEL for uncontrolled: "
+ "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(cp->caddr),ntohs(cp->cport),
+ NIPQUAD(cp->vaddr),ntohs(cp->vport));
+ return;
+ }
+
+ IP_VS_DBG(7, "DELeting control for: "
+ "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n",
+ NIPQUAD(cp->caddr),ntohs(cp->cport),
+ NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport));
+
+ cp->control = NULL;
+ if (atomic_read(&ctl_cp->n_control) == 0) {
+ IP_VS_ERR("BUG control DEL with n=0 : "
+ "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(cp->caddr),ntohs(cp->cport),
+ NIPQUAD(cp->vaddr),ntohs(cp->vport));
+ return;
+ }
+ atomic_dec(&ctl_cp->n_control);
+}
+
+static inline void
+ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp)
+{
+ if (cp->control) {
+ IP_VS_ERR("request control ADD for already controlled: "
+ "%d.%d.%d.%d:%d to %d.%d.%d.%d:%d\n",
+ NIPQUAD(cp->caddr),ntohs(cp->cport),
+ NIPQUAD(cp->vaddr),ntohs(cp->vport));
+ ip_vs_control_del(cp);
+ }
+
+ IP_VS_DBG(7, "ADDing control for: "
+ "cp.dst=%d.%d.%d.%d:%d ctl_cp.dst=%d.%d.%d.%d:%d\n",
+ NIPQUAD(cp->caddr),ntohs(cp->cport),
+ NIPQUAD(ctl_cp->caddr),ntohs(ctl_cp->cport));
+
+ cp->control = ctl_cp;
+ atomic_inc(&ctl_cp->n_control);
+}
+
+
+/*
+ * IPVS application functions
+ * (from ip_vs_app.c)
+ */
+#define IP_VS_APP_MAX_PORTS 8
+extern int register_ip_vs_app(struct ip_vs_app *app);
+extern void unregister_ip_vs_app(struct ip_vs_app *app);
+extern int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern void ip_vs_unbind_app(struct ip_vs_conn *cp);
+extern int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port);
+extern int ip_vs_app_inc_get(struct ip_vs_app *inc);
+extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
+
+extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
+extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
+extern int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+ char *o_buf, int o_len, char *n_buf, int n_len);
+extern int ip_vs_app_init(void);
+extern void ip_vs_app_cleanup(void);
+
+
+/*
+ * IPVS protocol functions (from ip_vs_proto.c)
+ */
+extern int ip_vs_protocol_init(void);
+extern void ip_vs_protocol_cleanup(void);
+extern void ip_vs_protocol_timeout_change(int flags);
+extern int *ip_vs_create_timeout_table(int *table, int size);
+extern int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to);
+extern struct ip_vs_protocol ip_vs_protocol_tcp;
+extern struct ip_vs_protocol ip_vs_protocol_udp;
+extern struct ip_vs_protocol ip_vs_protocol_icmp;
+extern struct ip_vs_protocol ip_vs_protocol_esp;
+extern struct ip_vs_protocol ip_vs_protocol_ah;
+
+
+/*
+ * Registering/unregistering scheduler functions
+ * (from ip_vs_sched.c)
+ */
+extern int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
+extern int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
+extern int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *scheduler);
+extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
+extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
+extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
+extern struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph);
+extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_protocol *pp, union ip_vs_tphdr h);
+
+
+/*
+ * IPVS control data and functions (from ip_vs_ctl.c)
+ */
+extern int sysctl_ip_vs_cache_bypass;
+extern int sysctl_ip_vs_expire_nodest_conn;
+extern int sysctl_ip_vs_sync_threshold[2];
+extern int sysctl_ip_vs_nat_icmp_send;
+extern atomic_t ip_vs_dropentry;
+extern struct ip_vs_stats ip_vs_stats;
+
+extern struct ip_vs_service *
+ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport);
+
+static inline void ip_vs_service_put(struct ip_vs_service *svc)
+{
+ atomic_dec(&svc->usecnt);
+}
+
+extern struct ip_vs_dest *
+ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport);
+extern int ip_vs_use_count_inc(void);
+extern void ip_vs_use_count_dec(void);
+extern void update_defense_level(void);
+extern int ip_vs_control_init(void);
+extern void ip_vs_control_cleanup(void);
+
+
+/*
+ * IPVS sync daemon data and function prototypes
+ * (from ip_vs_sync.c)
+ */
+extern volatile int ip_vs_sync_state;
+extern volatile int ip_vs_master_syncid;
+extern volatile int ip_vs_backup_syncid;
+extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
+extern int stop_sync_thread(int state);
+extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+
+
+/*
+ * IPVS rate estimator prototypes (from ip_vs_est.c)
+ */
+extern int ip_vs_new_estimator(struct ip_vs_stats *stats);
+extern void ip_vs_kill_estimator(struct ip_vs_stats *stats);
+extern void ip_vs_zero_estimator(struct ip_vs_stats *stats);
+
+
+/*
+ * Slow timer functions for IPVS
+ * (from ip_vs_timer.c)
+ */
+extern void add_sltimer(struct sltimer_list *timer);
+extern int del_sltimer(struct sltimer_list *timer);
+extern void mod_sltimer(struct sltimer_list *timer, unsigned long expires);
+extern void ip_vs_sltimer_init(void);
+extern void ip_vs_sltimer_cleanup(void);
+static inline void init_sltimer(struct sltimer_list *timer)
+{
+ timer->list.next = timer->list.prev = NULL;
+}
+
+static inline int sltimer_pending(const struct sltimer_list *timer)
+{
+ return timer->list.next != NULL;
+}
+
+
+/*
+ * Various IPVS packet transmitters (from ip_vs_xmit.c)
+ */
+extern int ip_vs_null_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern int ip_vs_bypass_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern int ip_vs_nat_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern int ip_vs_tunnel_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern int ip_vs_dr_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern int ip_vs_icmp_xmit
+(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
+extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
+
+
+/*
+ * This is a simple mechanism to ignore packets when
+ * we are loaded. Just set ip_vs_drop_rate to 'n' and
+ * we start to drop 1/rate of the packets
+ */
+extern int ip_vs_drop_rate;
+extern int ip_vs_drop_counter;
+
+static __inline__ int ip_vs_todrop(void)
+{
+ if (!ip_vs_drop_rate) return 0;
+ if (--ip_vs_drop_counter > 0) return 0;
+ ip_vs_drop_counter = ip_vs_drop_rate;
+ return 1;
+}
+
+/*
+ * ip_vs_fwd_tag returns the forwarding tag of the connection
+ */
+#define IP_VS_FWD_METHOD(cp) (cp->flags & IP_VS_CONN_F_FWD_MASK)
+
+extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp)
+{
+ char fwd;
+
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ fwd = 'M'; break;
+ case IP_VS_CONN_F_LOCALNODE:
+ fwd = 'L'; break;
+ case IP_VS_CONN_F_TUNNEL:
+ fwd = 'T'; break;
+ case IP_VS_CONN_F_DROUTE:
+ fwd = 'R'; break;
+ case IP_VS_CONN_F_BYPASS:
+ fwd = 'B'; break;
+ default:
+ fwd = '?'; break;
+ }
+ return fwd;
+}
+
+
+static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum)
+{
+ u32 diff[2] = { old, new };
+
+ return csum_fold(csum_partial((char *) diff, sizeof(diff),
+ oldsum ^ 0xFFFF));
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _IP_VS_H */
If unsure, say Y.
source "net/ipv4/netfilter/Kconfig"
+source "net/ipv4/ipvs/Kconfig"
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter/
+obj-$(CONFIG_IP_VS) += ipvs/
obj-y += xfrm4_policy.o xfrm4_state.o xfrm4_input.o xfrm4_tunnel.o
--- /dev/null
+#
+# IP Virtual Server configuration
+#
+menu "IP: Virtual Server Configuration"
+ depends on INET && NETFILTER
+
+config IP_VS
+ tristate "IP virtual server support (EXPERIMENTAL)"
+ depends on INET && NETFILTER
+ ---help---
+ IP Virtual Server support will let you build a high-performance
+ virtual server based on cluster of two or more real servers. This
+ option must be enabled for at least one of the clustered computers
+ that will take care of intercepting incomming connections to a
+ single IP address and scheduling them to real servers.
+
+ Three request dispatching techniques are implemented, they are
+ virtual server via NAT, virtual server via tunneling and virtual
+ server via direct routing. The several scheduling algorithms can
+ be used to choose which server the connection is directed to,
+ thus load balancing can be achieved among the servers. For more
+ information and its administration program, please visit the
+ following URL:
+ http://www.linuxvirtualserver.org/
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_DEBUG
+ bool "IP virtual server debugging"
+ depends on IP_VS
+ ---help---
+ Say Y here if you want to get additional messages useful in
+ debugging the IP virtual server code. You can change the debug
+ level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+ int "IPVS connection table size (the Nth power of 2)"
+ depends on IP_VS
+ default "12"
+ ---help---
+ The IPVS connection hash table uses the chaining scheme to handle
+ hash collisions. Using a big IPVS connection hash table will greatly
+ reduce conflicts when there are hundreds of thousands of connections
+ in the hash table.
+
+ Note the table size must be power of 2. The table size will be the
+ value of 2 to the your input number power. The number to choose is
+ from 8 to 20, the default number is 12, which means the table size
+ is 4096. Don't input the number too small, otherwise you will lose
+ performance on it. You can adapt the table size yourself, according
+ to your virtual server application. It is good to set the table size
+ not far less than the number of connections per second multiplying
+ average lasting time of connection in the table. For example, your
+ virtual server gets 200 connections per second, the connection lasts
+ for 200 seconds in average in the connection table, the table size
+ should be not far less than 200x200, it is good to set the table
+ size 32768 (2**15).
+
+ Another note that each connection occupies 128 bytes effectively and
+ each hash entry uses 8 bytes, so you can estimate how much memory is
+ needed for your box.
+
+choice
+ prompt "IPVS connection hash function"
+ default IP_VS_HASH_JENKINS
+ ---help---
+ IPVS connection hash function is used to hash IPVS connection
+ entries. It takes the protocol, client address and port number
+ <proto, addr, port> (in network order) to compute hash key.
+ Here you need to choose a hash function to compute hash key.
+ The Jenkins hash is recommended by default.
+
+config IP_VS_HASH_SHIFTXOR
+ bool "SHIFTXOR"
+ ---help---
+ The SHIFTXOR hash function is to compute key in the following way:
+ key = ntohl(addr) + ip_vs_conn_rnd;
+ key ^= (key >> IP_VS_CONN_TAB_BITS);
+ key ^= (key >> 23);
+ key = proto ^ key ^ ntohs(port)) & IP_VS_CONN_TAB_MASK;
+
+ The random value ip_vs_conn_rnd is introduced to prevent from
+ hash attack.
+
+config IP_VS_HASH_GOLDENRATIO
+ bool "GOLDENRATIO"
+ ---help---
+ In Knuth's "The Art of Computer Programming", section 6.4, a
+ multiplicative hashing scheme is introduced as a way to write hash
+ function. The key is multiplied by the golden ratio of 2^32
+ (2654435761) to produce a hash result. Note that 2654435761 is
+ also a prime number.
+
+ The GOLDENRATIO hash function is to compute connection hash key
+ in the following way:
+ key = ip_vs_conn_rnd ^ (proto + addr + port);
+ key = ((key * 2654435761) >> (31 - IP_VS_CONN_TAB_BITS))
+ & IP_VS_CONN_TAB_MASK;
+
+config IP_VS_HASH_JENKINS
+ bool "JENKINS"
+ ---help---
+ The Jenkins hash support is included in the Linux kernel, the
+ header file is at linux/include/jhash.h. You can read the
+ http://burtleburtle.net/bob/hash/index.html for more information
+ about the Jenkins hash.
+
+ The Jenkins hash function is used to compute connection hash key
+ in the following way:
+ key = jhash_3words(addr, port, proto, ip_vs_conn_rnd)
+ & IP_VS_CONN_TAB_MASK;
+
+endchoice
+
+comment "IPVS transport protocol load balancing support"
+ depends on IP_VS
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing ESP (Encapsultion
+ Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ depends on IP_VS
+ ---help---
+ This option enables support for load balancing AH (Authentication
+ Header) transport protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+ depends on IP_VS
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ depends on IP_VS
+ ---help---
+ The robin-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ depends on IP_VS
+ ---help---
+ The weighted robin-robin scheduling algorithm directs network
+ connections to different real servers based on server weights
+ in a round-robin manner. Servers with higher weights receive
+ new connections first than those with less weights, and servers
+ with higher weights get more connections than those with less
+ weights and servers with equal weights get equal connections.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling scheduling"
+ depends on IP_VS
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ depends on IP_VS
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection with replication scheduling"
+ depends on IP_VS
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in cache cluster.
+ This algorithm usually directs packet destined for an IP address to
+ its server if the server is alive and under load. If the server is
+ overloaded (its active connection numbers is larger than its weight)
+ and there is a server in its half load, then allocate the weighted
+ least-connection server to this IP address.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication schedulin"
+ depends on IP_VS
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in cache cluster. It differs from the LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the node in the server set are over loaded,
+ it picks up a least-connection node in the cluster and adds it
+ in the sever set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid high degree of replication.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ depends on IP_VS
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their destination IP addresses.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ depends on IP_VS
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers through looking up a statically assigned
+ hash table by their source IP addresses.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ depends on IP_VS
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the the ith server and Ui is the fixed service rate (weight)
+ of the ith server.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ depends on IP_VS
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimize its expected delay (The Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+comment 'IPVS application helper'
+ depends on IP_VS
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS
+ ---help---
+ FTP is a protocol that transfers IP address and/or port number in
+ the payload. In the virtual server via Network Address Translation,
+ the IP address and port number of real servers cannot be sent to
+ clients in ftp connections directly, so FTP protocol helper is
+ required for tracking the connection and mangling it back to that of
+ virtual service.
+
+ If you want to compile it in kernel, say Y. If you want to compile
+ it as a module, say M here and read Documentation/modules.txt. If
+ unsure, say N.
+
+endmenu
--- /dev/null
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+# Note! Dependencies are done automagically by 'make dep', which also
+# removes any old dependencies. DON'T put your own dependencies here
+# unless it's something special (ie not a .c file).
+#
+# Note 2! The CFLAGS definition is now in the main makefile...
+
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_timer.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \
+ $(ip_vs_proto-objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
--- /dev/null
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ * IP_MASQ_APP application masquerading module
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+/* ipvs application list head */
+static LIST_HEAD(ip_vs_app_list);
+static DECLARE_MUTEX(__ip_vs_app_mutex);
+
+
+/*
+ * Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+ /* test and get the module atomically */
+ if (app->module)
+ return try_module_get(app->module);
+ else
+ return 1;
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+ if (app->module)
+ module_put(app->module);
+}
+
+
+/*
+ * Allocate/initialize app incarnation and register it in proto apps.
+ */
+int
+ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+ struct ip_vs_protocol *pp;
+ struct ip_vs_app *inc;
+ int ret;
+
+ if (!(pp = ip_vs_proto_get(proto)))
+ return -EPROTONOSUPPORT;
+
+ if (!pp->unregister_app)
+ return -EOPNOTSUPP;
+
+ inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
+ if (!inc)
+ return -ENOMEM;
+ memcpy(inc, app, sizeof(*inc));
+ INIT_LIST_HEAD(&inc->p_list);
+ INIT_LIST_HEAD(&inc->incs_list);
+ inc->app = app;
+ inc->port = htons(port);
+ atomic_set(&inc->usecnt, 0);
+
+ if (app->timeouts) {
+ inc->timeout_table =
+ ip_vs_create_timeout_table(app->timeouts,
+ app->timeouts_size);
+ if (!inc->timeout_table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = pp->register_app(inc);
+ if (ret)
+ goto out;
+
+ list_add(&inc->a_list, &app->incs_list);
+ IP_VS_DBG(9, "%s application %s:%u registered\n",
+ pp->name, inc->name, inc->port);
+
+ return 0;
+
+ out:
+ if (inc->timeout_table)
+ kfree(inc->timeout_table);
+ kfree(inc);
+ return ret;
+}
+
+
+/*
+ * Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct ip_vs_app *inc)
+{
+ struct ip_vs_protocol *pp;
+
+ if (!(pp = ip_vs_proto_get(inc->protocol)))
+ return;
+
+ if (pp->unregister_app)
+ pp->unregister_app(inc);
+
+ IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+ pp->name, inc->name, inc->port);
+
+ list_del(&inc->a_list);
+
+ if (inc->timeout_table != NULL)
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+
+/*
+ * Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+ int result;
+
+ atomic_inc(&inc->usecnt);
+ if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
+ atomic_dec(&inc->usecnt);
+ return result;
+}
+
+
+/*
+ * Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+ ip_vs_app_put(inc->app);
+ atomic_dec(&inc->usecnt);
+}
+
+
+/*
+ * Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+ int result;
+
+ down(&__ip_vs_app_mutex);
+
+ result = ip_vs_app_inc_new(app, proto, port);
+
+ up(&__ip_vs_app_mutex);
+
+ return result;
+}
+
+
+/*
+ * ip_vs_app registration routine
+ */
+int register_ip_vs_app(struct ip_vs_app *app)
+{
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ down(&__ip_vs_app_mutex);
+
+ list_add(&app->a_list, &ip_vs_app_list);
+
+ up(&__ip_vs_app_mutex);
+
+ return 0;
+}
+
+
+/*
+ * ip_vs_app unregistration routine
+ * We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct ip_vs_app *app)
+{
+ struct ip_vs_app *inc;
+ struct list_head *l = &app->incs_list;
+
+ down(&__ip_vs_app_mutex);
+
+ while (l->next != l) {
+ inc = list_entry(l->next, struct ip_vs_app, a_list);
+ ip_vs_app_inc_release(inc);
+ }
+
+ list_del(&app->a_list);
+
+ up(&__ip_vs_app_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
+
+
+#if 0000
+/*
+ * Get reference to app by name (called from user context)
+ */
+struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
+{
+ struct list_head *p;
+ struct ip_vs_app *app, *a = NULL;
+
+ down(&__ip_vs_app_mutex);
+
+ list_for_each (p, &ip_vs_app_list) {
+ app = list_entry(p, struct ip_vs_app, a_list);
+ if (strcmp(app->name, appname))
+ continue;
+
+ /* softirq may call ip_vs_app_get too, so the caller
+ must disable softirq on the current CPU */
+ if (ip_vs_app_get(app))
+ a = app;
+ break;
+ }
+
+ up(&__ip_vs_app_mutex);
+
+ return a;
+}
+#endif
+
+
+/*
+ * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+{
+ return pp->app_conn_bind(cp);
+}
+
+
+/*
+ * Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+ struct ip_vs_app *inc = cp->app;
+
+ if (!inc)
+ return;
+
+ if (inc->unbind_conn)
+ inc->unbind_conn(inc, cp);
+ if (inc->done_conn)
+ inc->done_conn(inc, cp);
+ ip_vs_app_inc_put(inc);
+ cp->app = NULL;
+}
+
+
+/*
+ * Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 seq = ntohl(th->seq);
+
+ /*
+ * Adjust seq with delta-offset for all packets after
+ * the most recent resized pkt seq and with previous_delta offset
+ * for all packets before most recent resized pkt seq.
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ if(after(seq, vseq->init_seq)) {
+ th->seq = htonl(seq + vseq->delta);
+ IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
+ vseq->delta);
+ } else {
+ th->seq = htonl(seq + vseq->previous_delta);
+ IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
+ "(%d) to seq\n", vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 ack_seq = ntohl(th->ack_seq);
+
+ /*
+ * Adjust ack_seq with delta-offset for
+ * the packets AFTER most recent resized pkt has caused a shift
+ * for packets before most recent resized pkt, use previous_delta
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ /* since ack_seq is the number of octet that is expected
+ to receive next, so compare it with init_seq+delta */
+ if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+ th->ack_seq = htonl(ack_seq - vseq->delta);
+ IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
+ "(%d) from ack_seq\n", vseq->delta);
+
+ } else {
+ th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+ IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
+ "previous_delta (%d) from ack_seq\n",
+ vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Updates ip_vs_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+ unsigned flag, __u32 seq, int diff)
+{
+ /* spinlock is to keep updating cp->flags atomic */
+ spin_lock(&cp->lock);
+ if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+ vseq->previous_delta = vseq->delta;
+ vseq->delta += diff;
+ vseq->init_seq = seq;
+ cp->flags |= flag;
+ }
+ spin_unlock(&cp->lock);
+}
+
+
+/*
+ * Output pkt hook. Will call bound ip_vs_app specific function
+ * called by ipvs packet handler, assumes previously checked cp!=NULL
+ * returns (new - old) skb->len diff.
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+ int diff;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ __u32 seq;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 0;
+
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->protocol == IPPROTO_TCP) {
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_seq(&cp->out_seq, th);
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_ack_seq(&cp->in_seq, th);
+ }
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 0;
+
+ diff = app->pkt_out(app, cp, skb);
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0 && cp->protocol == IPPROTO_TCP)
+ vs_seq_update(cp, &cp->out_seq,
+ IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+ return diff;
+}
+
+
+/*
+ * Input pkt hook. Will call bound ip_vs_app specific function
+ * called by ipvs packet handler, assumes previously checked cp!=NULL.
+ * returns (new - old) skb->len diff.
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+ int diff;
+ struct iphdr *iph;
+ struct tcphdr *th;
+ __u32 seq;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 0;
+
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->protocol == IPPROTO_TCP) {
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_seq(&cp->in_seq, th);
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_ack_seq(&cp->out_seq, th);
+ }
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 0;
+
+ diff = app->pkt_in(app, cp, skb);
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0 && cp->protocol == IPPROTO_TCP)
+ vs_seq_update(cp, &cp->in_seq,
+ IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+ return diff;
+}
+
+
+/*
+ * /proc/net/ip_vs_app entry function
+ */
+static int
+ip_vs_app_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos=0;
+ int len=0;
+ char temp[64];
+ struct ip_vs_app *app, *inc;
+ struct list_head *e, *i;
+
+ pos = 64;
+ if (pos > offset) {
+ len += sprintf(buffer+len, "%-63s\n",
+ "prot port usecnt name");
+ }
+
+ down(&__ip_vs_app_mutex);
+ list_for_each (e, &ip_vs_app_list) {
+ app = list_entry(e, struct ip_vs_app, a_list);
+ list_for_each (i, &app->incs_list) {
+ inc = list_entry(i, struct ip_vs_app, a_list);
+
+ pos += 64;
+ if (pos <= offset)
+ continue;
+ sprintf(temp, "%-3s %-7u %-6d %-17s",
+ ip_vs_proto_name(inc->protocol),
+ ntohs(inc->port),
+ atomic_read(&inc->usecnt),
+ inc->name);
+ len += sprintf(buffer+len, "%-63s\n", temp);
+ if (pos >= offset+length)
+ goto done;
+ }
+ }
+ done:
+ up(&__ip_vs_app_mutex);
+
+ *start = buffer+len-(pos-offset); /* Start of wanted data */
+ len = pos-offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+
+/*
+ * Replace a segment of data with a new segment
+ */
+int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+ char *o_buf, int o_len, char *n_buf, int n_len)
+{
+ struct iphdr *iph;
+ int diff;
+ int o_offset;
+ int o_left;
+
+ EnterFunction(9);
+
+ diff = n_len - o_len;
+ o_offset = o_buf - (char *)skb->data;
+ /* The length of left data after o_buf+o_len in the skb data */
+ o_left = skb->len - (o_offset + o_len);
+
+ if (diff <= 0) {
+ memmove(o_buf + n_len, o_buf + o_len, o_left);
+ memcpy(o_buf, n_buf, n_len);
+ skb_trim(skb, skb->len + diff);
+ } else if (diff <= skb_tailroom(skb)) {
+ skb_put(skb, diff);
+ memmove(o_buf + n_len, o_buf + o_len, o_left);
+ memcpy(o_buf, n_buf, n_len);
+ } else {
+ if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
+ return -ENOMEM;
+ skb_put(skb, diff);
+ memmove(skb->data + o_offset + n_len,
+ skb->data + o_offset + o_len, o_left);
+ memcpy(skb->data + o_offset, n_buf, n_len);
+ }
+
+ /* must update the iph total length here */
+ iph = skb->nh.iph;
+ iph->tot_len = htons(skb->len);
+
+ LeaveFunction(9);
+ return 0;
+}
+
+
+int ip_vs_app_init(void)
+{
+ /* we will replace it with proc_net_ipvs_create() soon */
+ proc_net_create("ip_vs_app", 0, ip_vs_app_getinfo);
+ return 0;
+}
+
+
+void ip_vs_app_cleanup(void)
+{
+ proc_net_remove("ip_vs_app");
+}
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h> /* for proc_net_* */
+#include <linux/jhash.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/* SLAB cache for IPVS connections */
+static kmem_cache_t *ip_vs_conn_cachep;
+
+/* counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/* counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ * Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS 4
+#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+ rwlock_t l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_read_lock(unsigned key)
+{
+ read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+ read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+ write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+ write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_lock_bh(unsigned key)
+{
+ read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+ read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+ write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+ write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ * Returns hash value for IPVS connection entry
+ */
+static inline unsigned
+ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+#ifdef CONFIG_IP_VS_HASH_SHIFTXOR
+ unsigned key = ntohl(addr) + ip_vs_conn_rnd;
+
+ key ^= (key >> IP_VS_CONN_TAB_BITS);
+ key ^= (key >> 23);
+ return (proto ^ key ^ ntohs(port)) & IP_VS_CONN_TAB_MASK;
+#endif
+#ifdef CONFIG_IP_VS_HASH_GOLDENRATIO
+ return (((ip_vs_conn_rnd ^ (proto + addr + port)) * 2654435761UL)
+ >> (31 - IP_VS_CONN_TAB_BITS)) & IP_VS_CONN_TAB_MASK;
+#endif
+#ifdef CONFIG_IP_VS_HASH_JENKINS
+ return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
+ & IP_VS_CONN_TAB_MASK;
+#endif
+}
+
+
+/*
+ * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+ unsigned hash;
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /* Hash by protocol, client address and port */
+ hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+ ct_write_lock(hash);
+
+ list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ atomic_inc(&cp->refcnt);
+
+ ct_write_unlock(hash);
+
+ return 1;
+}
+
+
+/*
+ * UNhashes ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+ unsigned hash;
+
+ if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+ IP_VS_ERR("ip_vs_conn_unhash(): request for unhash flagged, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /* unhash it and decrease its reference counter */
+ hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+ ct_write_lock(hash);
+
+ list_del(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ atomic_dec(&cp->refcnt);
+
+ ct_write_unlock(hash);
+
+ return 1;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from OUTside-to-INside.
+ * s_addr, s_port: pkt source address (foreign host)
+ * d_addr, d_port: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp;
+ struct list_head *l,*e;
+
+ hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+ l = &ip_vs_conn_tab[hash];
+
+ ct_read_lock(hash);
+
+ for (e=l->next; e!=l; e=e->next) {
+ cp = list_entry(e, struct ip_vs_conn, c_list);
+ if (s_addr==cp->caddr && s_port==cp->cport &&
+ d_port==cp->vport && d_addr==cp->vaddr &&
+ protocol==cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ ct_read_unlock(hash);
+ return cp;
+ }
+ }
+
+ ct_read_unlock(hash);
+
+ return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ struct ip_vs_conn *cp;
+
+ cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
+ cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
+
+ IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ cp?"hit":"not hit");
+
+ return cp;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * s_addr, s_port: pkt source address (inside host)
+ * d_addr, d_port: pkt dest address (foreign host)
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+ unsigned hash;
+ struct ip_vs_conn *cp, *ret=NULL;
+ struct list_head *l,*e;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
+ l = &ip_vs_conn_tab[hash];
+
+ ct_read_lock(hash);
+
+ for (e=l->next; e!=l; e=e->next) {
+ cp = list_entry(e, struct ip_vs_conn, c_list);
+ if (d_addr == cp->caddr && d_port == cp->cport &&
+ s_port == cp->dport && s_addr == cp->daddr &&
+ protocol == cp->protocol) {
+ /* HIT */
+ atomic_inc(&cp->refcnt);
+ ret = cp;
+ break;
+ }
+ }
+
+ ct_read_unlock(hash);
+
+ IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ ip_vs_proto_name(protocol),
+ NIPQUAD(s_addr), ntohs(s_port),
+ NIPQUAD(d_addr), ntohs(d_port),
+ ret?"hit":"not hit");
+
+ return ret;
+}
+
+
+/*
+ * Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ /* reset it expire in its timeout */
+ mod_sltimer(&cp->timer, jiffies+cp->timeout);
+
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
+{
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ ip_vs_conn_unhash(cp);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ cp->cport = cport;
+ /* hash on new dport */
+ ip_vs_conn_hash(cp);
+}
+
+
+/*
+ * Bind a connection entry with the corresponding packet_xmit.
+ * Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit;
+ break;
+ }
+}
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
+
+/*
+ * Bind a connection entry with a virtual service destination
+ * Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+ /* if dest is NULL, then return directly */
+ if (!dest)
+ return;
+
+ /* Increase the refcnt counter of the dest */
+ atomic_inc(&dest->refcnt);
+
+ /* Bind with the destination and its corresponding transmitter */
+ cp->flags |= atomic_read(&dest->conn_flags);
+ cp->dest = dest;
+
+ IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ /* It is a normal connection, so increase the inactive
+ connection counter because it is in TCP SYNRECV
+ state (inactive) or other protocol inacive state */
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the peristent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ if (dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Unbind a connection entry with its VS destination
+ * Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (!dest)
+ return;
+
+ IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ /* It is a normal connection, so decrease the inactconns
+ or activeconns counter */
+ if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->inactconns);
+ } else {
+ atomic_dec(&dest->activeconns);
+ }
+ } else {
+ /* It is a persistent connection/template, so decrease
+ the peristent connection counter */
+ atomic_dec(&dest->persistconns);
+ }
+
+ if (dest->l_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else if (dest->u_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ }
+
+ /*
+ * Simply decrease the refcnt of the dest, because the
+ * dest will be either in service's destination list
+ * or in the trash.
+ */
+ atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ * Checking if the destination of a connection template is available.
+ * If available, return 1, otherwise invalidate this connection
+ * template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+ struct ip_vs_dest *dest = ct->dest;
+
+ /*
+ * Checking the dest server status.
+ */
+ if ((dest == NULL) ||
+ !(dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ IP_VS_DBG(9, "check_template: dest not available for "
+ "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "-> d:%u.%u.%u.%u:%d\n",
+ ip_vs_proto_name(ct->protocol),
+ NIPQUAD(ct->caddr), ntohs(ct->cport),
+ NIPQUAD(ct->vaddr), ntohs(ct->vport),
+ NIPQUAD(ct->daddr), ntohs(ct->dport));
+
+ /*
+ * Invalidate the connection template
+ */
+ ip_vs_conn_unhash(ct);
+ ct->dport = 65535;
+ ct->vport = 65535;
+ ct->cport = 0;
+ ip_vs_conn_hash(ct);
+
+ /*
+ * Simply decrease the refcnt of the template,
+ * don't restart its timer.
+ */
+ atomic_dec(&ct->refcnt);
+ return 0;
+ }
+ return 1;
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
+ cp->timeout = 60*HZ;
+
+ /*
+ * hey, I'm using it
+ */
+ atomic_inc(&cp->refcnt);
+
+ /*
+ * do I control anybody?
+ */
+ if (atomic_read(&cp->n_control))
+ goto expire_later;
+
+ /*
+ * unhash it if it is hashed in the conn table
+ */
+ ip_vs_conn_unhash(cp);
+
+ /*
+ * refcnt==1 implies I'm the only one referrer
+ */
+ if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* make sure that there is no timer on it now */
+ if (sltimer_pending(&cp->timer))
+ del_sltimer(&cp->timer);
+
+ /* does anybody control me? */
+ if (cp->control)
+ ip_vs_control_del(cp);
+
+ if (unlikely(cp->app != NULL))
+ ip_vs_unbind_app(cp);
+ ip_vs_unbind_dest(cp);
+ //ip_vs_timeout_detach(cp);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ atomic_dec(&ip_vs_conn_count);
+
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+ return;
+ }
+
+ /* hash it back to the table */
+ ip_vs_conn_hash(cp);
+
+ expire_later:
+ IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+ atomic_read(&cp->refcnt)-1,
+ atomic_read(&cp->n_control));
+
+ ip_vs_conn_put(cp);
+}
+
+
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+ cp->timeout = 0;
+ mod_sltimer(&cp->timer, jiffies);
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
+ __u32 daddr, __u16 dport, unsigned flags,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ if (cp == NULL) {
+ IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+ return NULL;
+ }
+
+ memset(cp, 0, sizeof(*cp));
+ INIT_LIST_HEAD(&cp->c_list);
+ init_sltimer(&cp->timer);
+ cp->timer.data = (unsigned long)cp;
+ cp->timer.function = ip_vs_conn_expire;
+ cp->protocol = proto;
+ cp->caddr = caddr;
+ cp->cport = cport;
+ cp->vaddr = vaddr;
+ cp->vport = vport;
+ cp->daddr = daddr;
+ cp->dport = dport;
+ cp->flags = flags;
+ cp->lock = SPIN_LOCK_UNLOCKED;
+
+ /*
+ * Set the entry is referenced by the current thread before hashing
+ * it in the table, so that other thread run ip_vs_random_dropentry
+ * but cannot drop this entry.
+ */
+ atomic_set(&cp->refcnt, 1);
+
+ atomic_set(&cp->n_control, 0);
+ atomic_set(&cp->in_pkts, 0);
+
+ atomic_inc(&ip_vs_conn_count);
+ if (flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+ /* Bind the connection with a destination server */
+ ip_vs_bind_dest(cp, dest);
+
+ /* Set its state and timeout */
+ cp->state = 0;
+ cp->timeout = 3*HZ;
+
+ /* Bind its packet transmitter */
+ ip_vs_bind_xmit(cp);
+
+ if (unlikely(pp && atomic_read(&pp->appcnt)))
+ ip_vs_bind_app(cp, pp);
+
+ /* Hash it in the ip_vs_conn_tab finally */
+ ip_vs_conn_hash(cp);
+
+ return cp;
+}
+
+
+/*
+ * /proc/net/ip_vs_conn entries
+ */
+static int
+ip_vs_conn_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos=0;
+ int idx, len=0;
+ char temp[70];
+ struct ip_vs_conn *cp;
+ struct list_head *l, *e;
+
+ pos = 128;
+ if (pos > offset) {
+ len += sprintf(buffer+len, "%-127s\n",
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires");
+ }
+
+ for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ /*
+ * Lock is actually only need in next loop
+ * we are called from uspace: must stop bh.
+ */
+ ct_read_lock_bh(idx);
+
+ l = &ip_vs_conn_tab[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ cp = list_entry(e, struct ip_vs_conn, c_list);
+ pos += 128;
+ if (pos <= offset)
+ continue;
+ sprintf(temp,
+ "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr), ntohs(cp->cport),
+ ntohl(cp->vaddr), ntohs(cp->vport),
+ ntohl(cp->daddr), ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ);
+ len += sprintf(buffer+len, "%-127s\n", temp);
+ if (pos >= offset+length) {
+ ct_read_unlock_bh(idx);
+ goto done;
+ }
+ }
+ ct_read_unlock_bh(idx);
+ }
+
+ done:
+ *start = buffer+len-(pos-offset); /* Start of wanted data */
+ len = pos-offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+
+/*
+ * Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+ /*
+ * The drop rate array needs tuning for real environments.
+ * Called from timer bh only => no locking
+ */
+ static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static char todrop_counter[9] = {0};
+ int i;
+
+ /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+ This will leave enough time for normal connection to get
+ through. */
+ if (cp->timeout+jiffies-cp->timer.expires < 60*HZ)
+ return 0;
+
+ /* Don't drop the entry if its number of incoming packets is not
+ located in [0, 8] */
+ i = atomic_read(&cp->in_pkts);
+ if (i > 8 || i < 0) return 0;
+
+ if (!todrop_rate[i]) return 0;
+ if (--todrop_counter[i] > 0) return 0;
+
+ todrop_counter[i] = todrop_rate[i];
+ return 1;
+}
+
+
+void ip_vs_random_dropentry(void)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct list_head *l,*e;
+ struct ip_vs_conn *ct;
+
+ /*
+ * Randomly scan 1/32 of the whole table every second
+ */
+ for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
+ unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+
+ /*
+ * Lock is actually needed in this loop.
+ */
+ ct_write_lock(hash);
+
+ l = &ip_vs_conn_tab[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ cp = list_entry(e, struct ip_vs_conn, c_list);
+ if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+ /* connection template */
+ continue;
+
+ if (cp->protocol == IPPROTO_TCP) {
+ switch(cp->state) {
+ case IP_VS_TCP_S_SYN_RECV:
+ case IP_VS_TCP_S_SYNACK:
+ break;
+
+ case IP_VS_TCP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+
+ default:
+ continue;
+ }
+ } else {
+ if (!todrop_entry(cp))
+ continue;
+ }
+
+ /*
+ * Drop the entry, and drop its ct if not referenced
+ */
+ atomic_inc(&cp->refcnt);
+ ct_write_unlock(hash);
+
+ if ((ct = cp->control))
+ atomic_inc(&ct->refcnt);
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ if (ct) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(ct);
+ }
+ ct_write_lock(hash);
+ }
+ ct_write_unlock(hash);
+ }
+}
+
+
+/*
+ * Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(void)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct list_head *l,*e;
+ struct ip_vs_conn *ct;
+
+ flush_again:
+ for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+ /*
+ * Lock is actually needed in this loop.
+ */
+ ct_write_lock_bh(idx);
+
+ l = &ip_vs_conn_tab[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ cp = list_entry(e, struct ip_vs_conn, c_list);
+ atomic_inc(&cp->refcnt);
+ ct_write_unlock(idx);
+
+ if ((ct = cp->control))
+ atomic_inc(&ct->refcnt);
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ if (ct) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(ct);
+ }
+ ct_write_lock(idx);
+ }
+ ct_write_unlock_bh(idx);
+ }
+
+ /* the counter may be not NULL, because maybe some conn entries
+ are run by slow timer handler or unhashed but still referred */
+ if (atomic_read(&ip_vs_conn_count) != 0) {
+ schedule();
+ goto flush_again;
+ }
+}
+
+
+int ip_vs_conn_init(void)
+{
+ int idx;
+
+ /*
+ * Allocate the connection hash table and initialize its list heads
+ */
+ ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+ if (!ip_vs_conn_tab)
+ return -ENOMEM;
+
+ /* Allocate ip_vs_conn slab cache */
+ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ sizeof(struct ip_vs_conn), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (!ip_vs_conn_cachep) {
+ vfree(ip_vs_conn_tab);
+ return -ENOMEM;
+ }
+
+ IP_VS_INFO("Connection hash table configured "
+ "(size=%d, memory=%ldKbytes)\n",
+ IP_VS_CONN_TAB_SIZE,
+ (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+ IP_VS_DBG(0, "Each connection entry needs %d bytes at least\n",
+ sizeof(struct ip_vs_conn));
+
+ for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+ }
+
+ for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+ __ip_vs_conntbl_lock_array[idx].l = RW_LOCK_UNLOCKED;
+ }
+
+ proc_net_create("ip_vs_conn", 0, ip_vs_conn_getinfo);
+
+ /* calculate the random value for connection hash */
+ ip_vs_conn_rnd =
+ jhash_3words((u32) jiffies, (u32) ip_vs_conn_tab,
+ net_random(), IP_VS_CONN_TAB_SIZE);
+
+ return 0;
+}
+
+
+void ip_vs_conn_cleanup(void)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush();
+
+ /* Release the empty cache */
+ kmem_cache_destroy(ip_vs_conn_cachep);
+ proc_net_remove("ip_vs_conn");
+ vfree(ip_vs_conn_tab);
+}
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_skb_replace);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+EXPORT_SYMBOL(check_for_ip_vs_out);
+
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph) ((icmph->un).echo.id)
+
+const char *ip_vs_proto_name(unsigned proto)
+{
+ static char buf[20];
+
+ switch (proto) {
+ case IPPROTO_IP:
+ return "IP";
+ case IPPROTO_UDP:
+ return "UDP";
+ case IPPROTO_TCP:
+ return "TCP";
+ case IPPROTO_ICMP:
+ return "ICMP";
+ default:
+ sprintf(buf, "IP_%d", proto);
+ return buf;
+ }
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+ while (--rows >= 0)
+ INIT_LIST_HEAD(&table[rows]);
+}
+
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ spin_lock(&dest->stats.lock);
+ dest->stats.inpkts++;
+ dest->stats.inbytes += skb->len;
+ spin_unlock(&dest->stats.lock);
+
+ spin_lock(&dest->svc->stats.lock);
+ dest->svc->stats.inpkts++;
+ dest->svc->stats.inbytes += skb->len;
+ spin_unlock(&dest->svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.inpkts++;
+ ip_vs_stats.inbytes += skb->len;
+ spin_unlock(&ip_vs_stats.lock);
+ }
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ spin_lock(&dest->stats.lock);
+ dest->stats.outpkts++;
+ dest->stats.outbytes += skb->len;
+ spin_unlock(&dest->stats.lock);
+
+ spin_lock(&dest->svc->stats.lock);
+ dest->svc->stats.outpkts++;
+ dest->svc->stats.outbytes += skb->len;
+ spin_unlock(&dest->svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.outpkts++;
+ ip_vs_stats.outbytes += skb->len;
+ spin_unlock(&ip_vs_stats.lock);
+ }
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+ spin_lock(&cp->dest->stats.lock);
+ cp->dest->stats.conns++;
+ spin_unlock(&cp->dest->stats.lock);
+
+ spin_lock(&svc->stats.lock);
+ svc->stats.conns++;
+ spin_unlock(&svc->stats.lock);
+
+ spin_lock(&ip_vs_stats.lock);
+ ip_vs_stats.conns++;
+ spin_unlock(&ip_vs_stats.lock);
+}
+
+
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ struct ip_vs_protocol *pp)
+{
+ if (unlikely(!pp->state_transition))
+ return 0;
+ return pp->state_transition(cp, direction, iph, h, pp);
+}
+
+
+/*
+ * IPVS persistent scheduling function
+ * It creates a connection entry according to its template if exists,
+ * or selects a server and creates a connection entry plus a template.
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ * Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ const __u16 *portp;
+ struct ip_vs_conn *ct;
+ __u16 dport; /* destination port to forward */
+ __u32 snet; /* source network of the client, after masking */
+
+ portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+
+ /* Mask saddr with the netmask to adjust template granularity */
+ snet = iph->saddr & svc->netmask;
+
+ IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
+ "mnet %u.%u.%u.%u\n",
+ NIPQUAD(iph->saddr), ntohs(portp[0]),
+ NIPQUAD(iph->daddr), ntohs(portp[1]),
+ NIPQUAD(snet));
+
+ /*
+ * As far as we know, FTP is a very complicated network protocol, and
+ * it uses control connection and data connections. For active FTP,
+ * FTP server initialize data connection to the client, its source port
+ * is often 20. For passive FTP, FTP server tells the clients the port
+ * that it passively listens to, and the client issues the data
+ * connection. In the tunneling or direct routing mode, the load
+ * balancer is on the client-to-server half of connection, the port
+ * number is unknown to the load balancer. So, a conn template like
+ * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+ * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+ * is created for other persistent services.
+ */
+ if (portp[1] == svc->port) {
+ /* Check if a template already exists */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, portp[1]);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * No template found or the dest of the connection
+ * template is not available.
+ */
+ dest = svc->scheduler->schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a template like <protocol,caddr,0,
+ * vaddr,vport,daddr,dport> for non-ftp service,
+ * and <protocol,caddr,0,vaddr,0,daddr,0>
+ * for ftp service.
+ */
+ if (svc->port != FTPPORT)
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, portp[1],
+ dest->addr, dest->port,
+ 0,
+ dest);
+ else
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ if (ct == NULL)
+ return NULL;
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ }
+ dport = dest->port;
+ } else {
+ /*
+ * Note: persistent fwmark-based services and persistent
+ * port zero service are handled here.
+ * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+ */
+ if (svc->fwmark)
+ ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+ htonl(svc->fwmark), 0);
+ else
+ ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+ iph->daddr, 0);
+
+ if (!ct || !ip_vs_check_template(ct)) {
+ /*
+ * If it is not persistent port zero, return NULL,
+ * otherwise create a connection template.
+ */
+ if (svc->port)
+ return NULL;
+
+ dest = svc->scheduler->schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a template according to the service
+ */
+ if (svc->fwmark)
+ ct = ip_vs_conn_new(IPPROTO_IP,
+ snet, 0,
+ htonl(svc->fwmark), 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ else
+ ct = ip_vs_conn_new(iph->protocol,
+ snet, 0,
+ iph->daddr, 0,
+ dest->addr, 0,
+ 0,
+ dest);
+ if (ct == NULL)
+ return NULL;
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ }
+ dport = portp[1];
+ }
+
+ /*
+ * Create a new connection according to the template
+ */
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, portp[0],
+ iph->daddr, portp[1],
+ dest->addr, dport,
+ 0,
+ dest);
+ if (cp == NULL) {
+ ip_vs_conn_put(ct);
+ return NULL;
+ }
+
+ /*
+ * Add its control
+ */
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * IPVS main scheduling function
+ * It selects a server according to the virtual service, and
+ * creates a connection entry.
+ * Protocols supported: TCP, UDP
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ const __u16 *portp;
+
+ /*
+ * Persistent service
+ */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, iph);
+
+ /*
+ * Non-persistent service
+ */
+ portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
+ if (!svc->fwmark && portp[1] != svc->port) {
+ if (!svc->port)
+ IP_VS_ERR("Schedule: port zero only supported "
+ "in persistent services, "
+ "check your ipvs configuration\n");
+ return NULL;
+ }
+
+ dest = svc->scheduler->schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "Schedule: no dest found.\n");
+ return NULL;
+ }
+
+ /*
+ * Create a connection entry.
+ */
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, portp[0],
+ iph->daddr, portp[1],
+ dest->addr, dest->port?dest->port:portp[1],
+ 0,
+ dest);
+ if (cp == NULL)
+ return NULL;
+
+ IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
+ "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * Pass or drop the packet.
+ * Called by ip_vs_in, when the virtual service is available but
+ * no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_protocol *pp, union ip_vs_tphdr h)
+{
+ struct iphdr *iph = skb->nh.iph;
+
+ /* if it is fwmark-based service, the cache_bypass sysctl is up
+ and the destination is RTN_UNICAST (and not local), then create
+ a cache_bypass connection entry */
+ if (sysctl_ip_vs_cache_bypass && svc->fwmark
+ && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+ int ret, cs;
+ struct ip_vs_conn *cp;
+
+ ip_vs_service_put(svc);
+
+ /* create a new connection entry */
+ IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
+ cp = ip_vs_conn_new(iph->protocol,
+ iph->saddr, h.portp[0],
+ iph->daddr, h.portp[1],
+ 0, 0,
+ IP_VS_CONN_F_BYPASS,
+ NULL);
+ if (cp == NULL) {
+ kfree_skb(skb);
+ return NF_STOLEN;
+ }
+
+ /* statistics */
+ ip_vs_in_stats(cp, skb);
+
+ /* set state */
+ cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
+
+ /* transmit the first SYN packet */
+ ret = cp->packet_xmit(skb, cp, pp);
+
+ atomic_inc(&cp->in_pkts);
+ ip_vs_conn_put(cp);
+ return ret;
+ }
+
+ /*
+ * When the virtual ftp service is presented, packets destined
+ * for other services on the VIP may get here (except services
+ * listed in the ipvs table), pass the packets, because it is
+ * not ipvs job to decide to drop the packets.
+ */
+ if ((svc->port == FTPPORT) && (h.portp[1] != FTPPORT)) {
+ ip_vs_service_put(svc);
+ return NF_ACCEPT;
+ }
+
+ ip_vs_service_put(svc);
+
+ /*
+ * Notify the client that the destination is unreachable, and
+ * release the socket buffer.
+ * Since it is in IP layer, the TCP socket is not actually
+ * created, the TCP RST packet cannot be sent, instead that
+ * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+ */
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ * chain, and is used for VS/NAT.
+ * It detects packets for VS/NAT connections and sends the packets
+ * immediately. This can avoid that iptable_nat mangles the packets
+ * for VS/NAT.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+ struct sk_buff **skb_p,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *skb_p;
+
+ if (!(skb->nfcache & NFC_IPVS_PROPERTY))
+ return NF_ACCEPT;
+
+ /* The packet was sent from IPVS, exit this chain */
+ (*okfn)(skb);
+
+ return NF_STOLEN;
+}
+
+
+/*
+ * Handle ICMP messages in the inside-to-outside direction (outgoing).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ * (Only used in VS/NAT)
+ */
+static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ struct iphdr *ciph; /* The ip header contained within the ICMP */
+ unsigned short ihl;
+ unsigned short len;
+ unsigned short clen, cihl;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ union ip_vs_tphdr h;
+
+ *related = 1;
+
+ /* reassemble IP fragments, but will it happen in ICMP packets?? */
+ if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return NF_STOLEN;
+ *skb_p = skb;
+ }
+
+ if (skb_is_nonlinear(skb)) {
+ if (skb_linearize(skb, GFP_ATOMIC) != 0)
+ return NF_DROP;
+ ip_send_check(skb->nh.iph);
+ }
+
+ iph = skb->nh.iph;
+ ihl = iph->ihl << 2;
+ icmph = (struct icmphdr *)((char *)iph + ihl);
+ len = ntohs(iph->tot_len) - ihl;
+ if (len < sizeof(struct icmphdr))
+ return NF_DROP;
+
+ IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+ icmph->type, ntohs(icmp_id(icmph)),
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_SOURCE_QUENCH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ clen = len - sizeof(struct icmphdr);
+ if (clen < sizeof(struct iphdr))
+ return NF_DROP;
+ ciph = (struct iphdr *) (icmph + 1);
+ cihl = ciph->ihl << 2;
+ if (clen < cihl)
+ return NF_DROP;
+
+ pp = ip_vs_proto_get(ciph->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
+ (pp->minhlen || pp->dont_defrag)))
+ return NF_ACCEPT;
+
+ /* We need at least TCP/UDP ports here */
+ if (clen < cihl + pp->minhlen_icmp)
+ return NF_DROP;
+
+ h.raw = (char *) ciph + cihl;
+
+ /* Ensure the checksum is correct */
+ if (ip_compute_csum((unsigned char *) icmph, len)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
+ NIPQUAD(iph->saddr));
+ return NF_DROP;
+ }
+
+ IP_VS_DBG_PKT(11, pp, ciph, "Handling outgoing ICMP for");
+
+ /* ciph content is actually <protocol, caddr, cport, daddr, dport> */
+ cp = pp->conn_out_get(skb, pp, ciph, h, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ if (IP_VS_FWD_METHOD(cp) != 0) {
+ IP_VS_ERR("shouldn't reach here, because the box is on the"
+ "half connection in the tun/dr module.\n");
+ }
+
+ /* Now we do real damage to this packet...! */
+ /* First change the source IP address, and recalc checksum */
+ iph->saddr = cp->vaddr;
+ ip_send_check(iph);
+
+ /* Now change the *dest* address in the contained IP */
+ ciph->daddr = cp->vaddr;
+ ip_send_check(ciph);
+
+ /* the TCP/UDP dest port - cannot redo check */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
+ h.portp[1] = cp->vport;
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* do the statistics and put it back */
+ ip_vs_out_stats(cp, skb);
+ __ip_vs_conn_put(cp);
+
+ IP_VS_DBG_PKT(11, pp, ciph, "Forwarding correct outgoing ICMP");
+
+ skb->nfcache |= NFC_IPVS_PROPERTY;
+
+ return NF_ACCEPT;
+}
+
+
+/*
+ * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ * Check if outgoing packet belongs to the established ip_vs_conn,
+ * rewrite addresses of the packet and send it on its way...
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph;
+ struct ip_vs_protocol *pp;
+ union ip_vs_tphdr h;
+ struct ip_vs_conn *cp;
+ int size, ihl, firstfrag;
+
+ EnterFunction(11);
+
+ if (skb->nfcache & NFC_IPVS_PROPERTY)
+ return NF_ACCEPT;
+
+ iph = skb->nh.iph;
+ if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+ int related, verdict = ip_vs_out_icmp(skb_p, &related);
+
+ if (related)
+ return verdict;
+ }
+
+ pp = ip_vs_proto_get(iph->protocol);
+ if (unlikely(!pp))
+ return NF_ACCEPT;
+
+ /* reassemble IP fragments */
+ if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
+ !pp->dont_defrag)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return NF_STOLEN;
+ iph = skb->nh.iph;
+ *skb_p = skb;
+ }
+
+ /* make sure that protocol header is available in skb data area,
+ note that skb data area may be reallocated. */
+ ihl = iph->ihl << 2;
+ firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
+ /*
+ * WARNING: we can work with !firstfrag packets, make sure
+ * each protocol handler checks for firstfrag
+ */
+ if (firstfrag &&
+ !pskb_may_pull(skb, ihl+pp->minhlen))
+ return NF_DROP;
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(skb, pp, iph, h, 0);
+
+ if (unlikely(!cp)) {
+ if (sysctl_ip_vs_nat_icmp_send &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP) &&
+ ip_vs_lookup_real_service(iph->protocol,
+ iph->saddr, h.portp[0])) {
+ /*
+ * Notify the real server: there is no existing
+ * entry if it is not RST packet or not TCP packet.
+ */
+ if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ kfree_skb(skb);
+ return NF_STOLEN;
+ }
+ }
+ IP_VS_DBG_PKT(12, pp, iph,
+ "packet continues traversal as normal");
+ if (!pp->dont_defrag)
+ ip_send_check(iph);
+ return NF_ACCEPT;
+ }
+
+ /*
+ * If it has ip_vs_app helper, the helper may change the payload,
+ * so it needs full checksum checking and checksum calculation.
+ * If not, only the header (addr/port) is changed, so it is fast
+ * to do incremental checksum update, and let the destination host
+ * do final checksum checking.
+ */
+
+ if (unlikely(cp->app && !pp->slave && skb_is_nonlinear(skb))) {
+ if (skb_linearize(skb, GFP_ATOMIC) != 0) {
+ ip_vs_conn_put(cp);
+ return NF_DROP;
+ }
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ }
+
+ size = skb->len - ihl;
+ IP_VS_DBG(11, "O-pkt: %s size=%d\n", pp->name, size);
+
+ /* do TCP/UDP checksum checking if it has application helper */
+ if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
+ if (!pp->csum_check(skb, pp, iph, h, size)) {
+ ip_vs_conn_put(cp);
+ return NF_DROP;
+ }
+ }
+
+ IP_VS_DBG_PKT(11, pp, iph, "Outgoing packet");
+
+ /* mangle the packet */
+ iph->saddr = cp->vaddr;
+ if (pp->snat_handler) {
+ pp->snat_handler(skb, pp, cp, iph, h, size);
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ }
+ ip_send_check(iph);
+
+ IP_VS_DBG_PKT(10, pp, iph, "After SNAT");
+
+ ip_vs_out_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, iph, h, pp);
+ ip_vs_conn_put(cp);
+
+ skb->nfcache |= NFC_IPVS_PROPERTY;
+
+ LeaveFunction(11);
+ return NF_ACCEPT;
+}
+
+
+/*
+ * Check if the packet is for VS/NAT connections, then send it
+ * immediately.
+ * Called by ip_fw_compact to detect packets for VS/NAT before
+ * they are changed by ipchains masquerading code.
+ */
+unsigned int
+check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *))
+{
+ unsigned int ret;
+
+ ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL);
+ if (ret != NF_ACCEPT) {
+ return ret;
+ } else {
+ /* send the packet immediately if it is already mangled
+ by ip_vs_out */
+ if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) {
+ (*okfn)(*skb_p);
+ return NF_STOLEN;
+ }
+ }
+ return NF_ACCEPT;
+}
+
+
+/*
+ * Handle ICMP messages in the outside-to-inside direction (incoming).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded
+ */
+static int ip_vs_in_icmp(struct sk_buff **skb_p, int *related)
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ struct iphdr *ciph; /* The ip header contained within the ICMP */
+ unsigned short len;
+ unsigned short clen, cihl;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ union ip_vs_tphdr h;
+ int rc;
+
+ *related = 1;
+ if (skb_is_nonlinear(skb)) {
+ if (skb_linearize(skb, GFP_ATOMIC) != 0)
+ return NF_DROP;
+ ip_send_check(skb->nh.iph);
+ }
+
+ iph = skb->nh.iph;
+ icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
+ len = ntohs(iph->tot_len) - (iph->ihl<<2);
+ if (len < sizeof(struct icmphdr))
+ return NF_DROP;
+
+ IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
+ icmph->type, ntohs(icmp_id(icmph)),
+ NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ if ((icmph->type != ICMP_DEST_UNREACH) &&
+ (icmph->type != ICMP_SOURCE_QUENCH) &&
+ (icmph->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /*
+ * If we get here we have an ICMP error of one of the above 3 types
+ * Now find the contained IP header
+ */
+ clen = len - sizeof(struct icmphdr);
+ if (clen < sizeof(struct iphdr))
+ return NF_DROP;
+ ciph = (struct iphdr *) (icmph + 1);
+ cihl = ciph->ihl << 2;
+ if (clen < cihl)
+ return NF_DROP;
+
+ pp = ip_vs_proto_get(ciph->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
+ (pp->minhlen || pp->dont_defrag)))
+ return NF_ACCEPT;
+
+ /* We need at least TCP/UDP ports here */
+ if (clen < cihl + pp->minhlen_icmp)
+ return NF_DROP;
+
+ /* Ensure the checksum is correct */
+ if (ip_compute_csum((unsigned char *) icmph, len)) {
+ /* Failed checksum! */
+ IP_VS_ERR_RL("incoming ICMP: failed checksum from "
+ "%d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
+ return NF_DROP;
+ }
+
+ h.raw = (char *) ciph + cihl;
+
+ IP_VS_DBG_PKT(11, pp, ciph, "Handling incoming ICMP for");
+
+ /* This is pretty much what ip_vs_conn_in_get() does,
+ except parameters are in the reverse order */
+ cp = pp->conn_in_get(skb, pp, ciph, h, 1);
+ if (cp == NULL)
+ return NF_ACCEPT;
+
+ ip_vs_in_stats(cp, skb);
+ rc = ip_vs_icmp_xmit(skb, cp, pp);
+ __ip_vs_conn_put(cp);
+ return rc;
+}
+
+
+/*
+ * Check if it's for virtual services, look it up,
+ * and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ struct ip_vs_protocol *pp = ip_vs_proto_get(iph->protocol);
+ union ip_vs_tphdr h;
+ struct ip_vs_conn *cp;
+ int ihl, ret, restart;
+ int firstfrag;
+
+ /*
+ * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
+ * ... don't know why 1st test DOES NOT include 2nd (?)
+ */
+ if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev)) {
+ IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
+ skb->pkt_type,
+ iph->protocol,
+ NIPQUAD(iph->daddr));
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+ int related, verdict = ip_vs_in_icmp(skb_p, &related);
+
+ if (related)
+ return verdict;
+ }
+
+ /* Protocol supported? */
+ if (unlikely(!pp))
+ return NF_ACCEPT;
+
+ /* make sure that protocol header is available in skb data area,
+ note that skb data area may be reallocated. */
+ ihl = iph->ihl << 2;
+#if 0
+ /* Enable this when not in LOCAL_IN */
+ firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
+ /*
+ * WARNING: we can work with !firstfrag packets, make sure
+ * each protocol handler checks for firstfrag
+ */
+#else
+ firstfrag = 1;
+#endif
+ if (firstfrag &&
+ !pskb_may_pull(skb, ihl+pp->minhlen))
+ return NF_DROP;
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+
+ /*
+ * Check if the packet belongs to an existing connection entry
+ */
+ cp = pp->conn_in_get(skb, pp, iph, h, 0);
+
+ if (unlikely(!cp)) {
+ int v;
+
+ if (!pp->conn_schedule(skb, pp, iph, h, &v, &cp)) {
+ return v;
+ }
+ }
+
+ if (unlikely(!cp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, pp, iph,
+ "packet continues traversal as normal");
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, pp, iph, "Incoming packet");
+
+ /* Check the server status */
+ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ /* the destination server is not availabe */
+
+ if (sysctl_ip_vs_expire_nodest_conn) {
+ /* try to expire the connection immediately */
+ ip_vs_conn_expire_now(cp);
+ } else {
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
+ }
+ return NF_DROP;
+ }
+
+ ip_vs_in_stats(cp, skb);
+ restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
+ if (cp->packet_xmit)
+ ret = cp->packet_xmit(skb, cp, pp);
+ else {
+ IP_VS_DBG_RL("warning: packet_xmit is null");
+ ret = NF_ACCEPT;
+ }
+
+ /* increase its packet counter and check if it is needed
+ to be synchronized */
+ atomic_inc(&cp->in_pkts);
+ if (ip_vs_sync_state == IP_VS_STATE_MASTER &&
+ (cp->protocol != IPPROTO_TCP ||
+ cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+ (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+ == sysctl_ip_vs_sync_threshold[0]))
+ ip_vs_sync_conn(cp);
+
+ ip_vs_conn_put(cp);
+ return ret;
+}
+
+
+/*
+ * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ * packets destined for 0.0.0.0/0.
+ * When fwmark-based virtual service is used, such as transparent
+ * cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
+ * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ * and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **skb_p,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct sk_buff *skb = *skb_p;
+ struct iphdr *iph = skb->nh.iph;
+ int r;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
+
+ if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+ skb = ip_defrag(skb);
+ if (!skb)
+ return NF_STOLEN;
+ *skb_p = skb;
+ }
+
+ return ip_vs_in_icmp(skb_p, &r);
+}
+
+
+/* After packet filtering, forward packet through VS/DR, VS/TUN,
+ or VS/NAT(change destination), so that filtering rules can be
+ applied to IPVS. */
+static struct nf_hook_ops ip_vs_in_ops = {
+ .hook = ip_vs_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_LOCAL_IN,
+ .priority = 100,
+};
+
+/* After packet filtering, change source only for VS/NAT */
+static struct nf_hook_ops ip_vs_out_ops = {
+ .hook = ip_vs_out,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = 100,
+};
+
+/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ destined for 0.0.0.0/0, which is for incoming IPVS connections */
+static struct nf_hook_ops ip_vs_forward_icmp_ops = {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_FORWARD,
+ .priority = 99,
+};
+
+/* Before the netfilter connection tracking, exit from POST_ROUTING */
+static struct nf_hook_ops ip_vs_post_routing_ops = {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_IP_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC-1,
+};
+
+
+/*
+ * Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+ int ret;
+
+ ret = ip_vs_control_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup control.\n");
+ goto cleanup_nothing;
+ }
+
+ ip_vs_sltimer_init();
+ ip_vs_protocol_init();
+
+ ret = ip_vs_app_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup application helper.\n");
+ goto cleanup_protocol;
+ }
+
+ ret = ip_vs_conn_init();
+ if (ret < 0) {
+ IP_VS_ERR("can't setup connection table.\n");
+ goto cleanup_app;
+ }
+
+ ret = nf_register_hook(&ip_vs_in_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register in hook.\n");
+ goto cleanup_conn;
+ }
+
+ ret = nf_register_hook(&ip_vs_out_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register out hook.\n");
+ goto cleanup_inops;
+ }
+ ret = nf_register_hook(&ip_vs_post_routing_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register post_routing hook.\n");
+ goto cleanup_outops;
+ }
+ ret = nf_register_hook(&ip_vs_forward_icmp_ops);
+ if (ret < 0) {
+ IP_VS_ERR("can't register forward_icmp hook.\n");
+ goto cleanup_postroutingops;
+ }
+
+ IP_VS_INFO("ipvs loaded.\n");
+ return ret;
+
+ cleanup_postroutingops:
+ nf_unregister_hook(&ip_vs_post_routing_ops);
+ cleanup_outops:
+ nf_unregister_hook(&ip_vs_out_ops);
+ cleanup_inops:
+ nf_unregister_hook(&ip_vs_in_ops);
+ cleanup_conn:
+ ip_vs_conn_cleanup();
+ cleanup_app:
+ ip_vs_app_cleanup();
+ cleanup_protocol:
+ ip_vs_protocol_cleanup();
+ ip_vs_sltimer_cleanup();
+ ip_vs_control_cleanup();
+ cleanup_nothing:
+ return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+ nf_unregister_hook(&ip_vs_forward_icmp_ops);
+ nf_unregister_hook(&ip_vs_post_routing_ops);
+ nf_unregister_hook(&ip_vs_out_ops);
+ nf_unregister_hook(&ip_vs_in_ops);
+ ip_vs_conn_cleanup();
+ ip_vs_app_cleanup();
+ ip_vs_protocol_cleanup();
+ ip_vs_sltimer_cleanup();
+ ip_vs_control_cleanup();
+ IP_VS_INFO("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/timer.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DECLARE_MUTEX(__ip_vs_mutex);
+
+/* lock for service table */
+rwlock_t __ip_vs_svc_lock = RW_LOCK_UNLOCKED;
+
+/* lock for table with the real services */
+static rwlock_t __ip_vs_rs_lock = RW_LOCK_UNLOCKED;
+
+/* lock for state and timeout tables */
+static rwlock_t __ip_vs_securetcp_lock = RW_LOCK_UNLOCKED;
+
+/* lock for drop entry handling */
+static spinlock_t __ip_vs_dropentry_lock = SPIN_LOCK_UNLOCKED;
+
+/* lock for drop packet handling */
+static spinlock_t __ip_vs_droppacket_lock = SPIN_LOCK_UNLOCKED;
+
+/* 1/rate drop and drop-entry variables */
+int ip_vs_drop_rate = 0;
+int ip_vs_drop_counter = 0;
+atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
+
+/* number of virtual services */
+static int ip_vs_num_services = 0;
+
+/* sysctl variables */
+static int sysctl_ip_vs_drop_entry = 0;
+static int sysctl_ip_vs_drop_packet = 0;
+static int sysctl_ip_vs_secure_tcp = 0;
+static int sysctl_ip_vs_amemthresh = 1024;
+static int sysctl_ip_vs_am_droprate = 10;
+int sysctl_ip_vs_cache_bypass = 0;
+int sysctl_ip_vs_expire_nodest_conn = 0;
+int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_nat_icmp_send = 0;
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+ return sysctl_ip_vs_debug_level;
+}
+#endif
+
+/*
+ * update_defense_level is called from timer bh and from sysctl.
+ */
+void update_defense_level(void)
+{
+ struct sysinfo i;
+ static int old_secure_tcp = 0;
+ int availmem;
+ int nomem;
+ int to_change = -1;
+
+ /* we only count free and buffered memory (in pages) */
+ si_meminfo(&i);
+ availmem = i.freeram + i.bufferram;
+ /* however in linux 2.5 the i.bufferram is total page cache size,
+ we need adjust it */
+ /* si_swapinfo(&i); */
+ /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+ nomem = (availmem < sysctl_ip_vs_amemthresh);
+
+ /* drop_entry */
+ spin_lock(&__ip_vs_dropentry_lock);
+ switch (sysctl_ip_vs_drop_entry) {
+ case 0:
+ atomic_set(&ip_vs_dropentry, 0);
+ break;
+ case 1:
+ if (nomem) {
+ atomic_set(&ip_vs_dropentry, 1);
+ sysctl_ip_vs_drop_entry = 2;
+ } else {
+ atomic_set(&ip_vs_dropentry, 0);
+ }
+ break;
+ case 2:
+ if (nomem) {
+ atomic_set(&ip_vs_dropentry, 1);
+ } else {
+ atomic_set(&ip_vs_dropentry, 0);
+ sysctl_ip_vs_drop_entry = 1;
+ };
+ break;
+ case 3:
+ atomic_set(&ip_vs_dropentry, 1);
+ break;
+ }
+ spin_unlock(&__ip_vs_dropentry_lock);
+
+ /* drop_packet */
+ spin_lock(&__ip_vs_droppacket_lock);
+ switch (sysctl_ip_vs_drop_packet) {
+ case 0:
+ ip_vs_drop_rate = 0;
+ break;
+ case 1:
+ if (nomem) {
+ ip_vs_drop_rate = ip_vs_drop_counter
+ = sysctl_ip_vs_amemthresh /
+ (sysctl_ip_vs_amemthresh-availmem);
+ sysctl_ip_vs_drop_packet = 2;
+ } else {
+ ip_vs_drop_rate = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ ip_vs_drop_rate = ip_vs_drop_counter
+ = sysctl_ip_vs_amemthresh /
+ (sysctl_ip_vs_amemthresh-availmem);
+ } else {
+ ip_vs_drop_rate = 0;
+ sysctl_ip_vs_drop_packet = 1;
+ }
+ break;
+ case 3:
+ ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+ break;
+ }
+ spin_unlock(&__ip_vs_droppacket_lock);
+
+ /* secure_tcp */
+ write_lock(&__ip_vs_securetcp_lock);
+ switch (sysctl_ip_vs_secure_tcp) {
+ case 0:
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ break;
+ case 1:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ sysctl_ip_vs_secure_tcp = 2;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ sysctl_ip_vs_secure_tcp = 1;
+ }
+ break;
+ case 3:
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ break;
+ }
+ old_secure_tcp = sysctl_ip_vs_secure_tcp;
+ if (to_change >= 0)
+ ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+ write_unlock(&__ip_vs_securetcp_lock);
+}
+
+
+int
+ip_vs_use_count_inc(void)
+{
+ return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+ module_put(THIS_MODULE);
+}
+
+
+/*
+ * Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+/*
+ * Hash table: for real service lookups
+ */
+#define IP_VS_RTAB_BITS 4
+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
+
+/*
+ * Trash for destinations
+ */
+static LIST_HEAD(ip_vs_dest_trash);
+
+/*
+ * FTP & NULL virtual service counters
+ */
+static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
+static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+
+
+/*
+ * Returns hash value for virtual service
+ */
+static __inline__ unsigned
+ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+ register unsigned porth = ntohs(port);
+
+ return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
+ & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Returns hash value of fwmark for virtual service lookup
+ */
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+{
+ return fwmark & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ * or in the ip_vs_svc_fwm_table by fwmark.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+ unsigned hash;
+
+ if (svc->flags & IP_VS_SVC_F_HASHED) {
+ IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /*
+ * Hash it by <protocol,addr,port> in ip_vs_svc_table
+ */
+ hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
+ list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ } else {
+ /*
+ * Hash it by fwmark in ip_vs_svc_fwm_table
+ */
+ hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+ list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ }
+
+ svc->flags |= IP_VS_SVC_F_HASHED;
+ /* increase its refcnt because it is referenced by the svc table */
+ atomic_inc(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+ if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+ IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /* Remove it from the ip_vs_svc_table table */
+ list_del(&svc->s_list);
+ } else {
+ /* Remove it from the ip_vs_svc_fwm_table table */
+ list_del(&svc->f_list);
+ }
+
+ svc->flags &= ~IP_VS_SVC_F_HASHED;
+ atomic_dec(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Get service by {proto,addr,port} in the service table.
+ */
+static __inline__ struct ip_vs_service *
+__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
+{
+ unsigned hash;
+ struct ip_vs_service *svc;
+ struct list_head *l,*e;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
+
+ l = &ip_vs_svc_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ svc = list_entry(e, struct ip_vs_service, s_list);
+ if ((svc->addr == vaddr)
+ && (svc->port == vport)
+ && (svc->protocol == protocol)) {
+ /* HIT */
+ atomic_inc(&svc->usecnt);
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Get service by {fwmark} in the service table.
+ */
+static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
+{
+ unsigned hash;
+ struct ip_vs_service *svc;
+ struct list_head *l,*e;
+
+ /* Check for fwmark addressed entries */
+ hash = ip_vs_svc_fwm_hashkey(fwmark);
+
+ l = &ip_vs_svc_fwm_table[hash];
+ for (e=l->next; e!=l; e=e->next) {
+ svc = list_entry(e, struct ip_vs_service, f_list);
+ if (svc->fwmark == fwmark) {
+ /* HIT */
+ atomic_inc(&svc->usecnt);
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+struct ip_vs_service *
+ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
+{
+ struct ip_vs_service *svc;
+
+ read_lock(&__ip_vs_svc_lock);
+
+ /*
+ * Check the table hashed by fwmark first
+ */
+ if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
+ goto out;
+
+ /*
+ * Check the table hashed by <protocol,addr,port>
+ * for "full" addressed entries
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, vport);
+
+ if (svc == NULL
+ && protocol == IPPROTO_TCP
+ && atomic_read(&ip_vs_ftpsvc_counter)
+ && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+ /*
+ * Check if ftp service entry exists, the packet
+ * might belong to FTP data connections.
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
+ }
+
+ if (svc == NULL
+ && atomic_read(&ip_vs_nullsvc_counter)) {
+ /*
+ * Check if the catch-all port (port zero) exists
+ */
+ svc = __ip_vs_service_get(protocol, vaddr, 0);
+ }
+
+ out:
+ read_unlock(&__ip_vs_svc_lock);
+
+ IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+ fwmark, ip_vs_proto_name(protocol),
+ NIPQUAD(vaddr), ntohs(vport),
+ svc?"hit":"not hit");
+
+ return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ atomic_inc(&svc->refcnt);
+ dest->svc = svc;
+}
+
+static inline void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = dest->svc;
+
+ dest->svc = NULL;
+ if (atomic_dec_and_test(&svc->refcnt))
+ kfree(svc);
+}
+
+
+/*
+ * Returns hash value for real service
+ */
+static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
+{
+ register unsigned porth = ntohs(port);
+
+ return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
+ & IP_VS_RTAB_MASK;
+}
+
+/*
+ * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ * should be called with locked tables.
+ */
+static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+{
+ unsigned hash;
+
+ if (!list_empty(&dest->d_list)) {
+ return 0;
+ }
+
+ /*
+ * Hash by proto,addr,port,
+ * which are the parameters of the real service.
+ */
+ hash = ip_vs_rs_hashkey(dest->addr, dest->port);
+ list_add(&dest->d_list, &ip_vs_rtable[hash]);
+
+ return 1;
+}
+
+/*
+ * UNhashes ip_vs_dest from ip_vs_rtable.
+ * should be called with locked tables.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+ /*
+ * Remove it from the ip_vs_rtable table.
+ */
+ if (!list_empty(&dest->d_list)) {
+ list_del(&dest->d_list);
+ INIT_LIST_HEAD(&dest->d_list);
+ }
+
+ return 1;
+}
+
+/*
+ * Lookup real service by <proto,addr,port> in the real service table.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
+{
+ unsigned hash;
+ struct ip_vs_dest *dest;
+ struct list_head *l,*e;
+
+ /*
+ * Check for "full" addressed entries
+ * Return the first found entry
+ */
+ hash = ip_vs_rs_hashkey(daddr, dport);
+
+ l = &ip_vs_rtable[hash];
+
+ read_lock(&__ip_vs_rs_lock);
+ for (e=l->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, d_list);
+ if ((dest->addr == daddr)
+ && (dest->port == dport)
+ && ((dest->protocol == protocol) ||
+ dest->vfwmark)) {
+ /* HIT */
+ read_unlock(&__ip_vs_rs_lock);
+ return dest;
+ }
+ }
+ read_unlock(&__ip_vs_rs_lock);
+
+ return NULL;
+}
+
+/*
+ * Lookup destination by {addr,port} in the given service
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+ struct ip_vs_dest *dest;
+ struct list_head *l, *e;
+
+ /*
+ * Find the destination for the given service
+ */
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ if ((dest->addr == daddr) && (dest->port == dport)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Lookup dest by {svc,addr,port} in the destination trash.
+ * The destination trash is used to hold the destinations that are removed
+ * from the service table but are still referenced by some conn entries.
+ * The reason to add the destination trash is when the dest is temporary
+ * down (either by administrator or by monitor program), the dest can be
+ * picked back from the trash, the remaining connections to the dest can
+ * continue, and the counting information of the dest is also useful for
+ * scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+ struct ip_vs_dest *dest;
+ struct list_head *l, *e;
+
+ /*
+ * Find the destination in trash
+ */
+ l = &ip_vs_dest_trash;
+
+ for (e=l->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
+ "refcnt=%d\n",
+ dest->vfwmark,
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (dest->addr == daddr &&
+ dest->port == dport &&
+ dest->vfwmark == svc->fwmark &&
+ dest->protocol == svc->protocol &&
+ (svc->fwmark ||
+ (dest->vaddr == svc->addr &&
+ dest->vport == svc->port))) {
+ /* HIT */
+ return dest;
+ }
+
+ /*
+ * Try to purge the destination from trash if not referenced
+ */
+ if (atomic_read(&dest->refcnt) == 1) {
+ IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
+ "from trash\n",
+ dest->vfwmark,
+ NIPQUAD(dest->addr), ntohs(dest->port));
+ e = e->prev;
+ list_del(&dest->n_list);
+ ip_vs_dst_reset(dest);
+ __ip_vs_unbind_svc(dest);
+ kfree(dest);
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Clean up all the destinations in the trash
+ * Called by the ip_vs_control_cleanup()
+ *
+ * When the ip_vs_control_clearup is activated by ipvs module exit,
+ * the service tables must have been flushed and all the connections
+ * are expired, and the refcnt of each destination in the trash must
+ * be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(void)
+{
+ struct ip_vs_dest *dest;
+ struct list_head *l;
+
+ l = &ip_vs_dest_trash;
+
+ while (l->next != l) {
+ dest = list_entry(l->next, struct ip_vs_dest, n_list);
+ list_del(&dest->n_list);
+ ip_vs_dst_reset(dest);
+ __ip_vs_unbind_svc(dest);
+ kfree(dest);
+ }
+}
+
+
+/*
+ * Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
+{
+ int conn_flags;
+
+ /* set the weight and the flags */
+ atomic_set(&dest->weight, udest->weight);
+ conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+
+ /* check if local node and update the flags */
+ if (inet_addr_type(udest->addr) == RTN_LOCAL) {
+ conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+ | IP_VS_CONN_F_LOCALNODE;
+ }
+
+ /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+ conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+ } else {
+ /*
+ * Put the real service in ip_vs_rtable if not present.
+ * For now only for NAT!
+ */
+ write_lock_bh(&__ip_vs_rs_lock);
+ ip_vs_rs_hash(dest);
+ write_unlock_bh(&__ip_vs_rs_lock);
+ }
+ atomic_set(&dest->conn_flags, conn_flags);
+
+ /* bind the service */
+ if (!dest->svc) {
+ __ip_vs_bind_svc(dest, svc);
+ } else {
+ if (dest->svc != svc) {
+ __ip_vs_unbind_svc(dest);
+ __ip_vs_bind_svc(dest, svc);
+ }
+ }
+
+ /* set the dest status flags */
+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ dest->u_threshold = udest->u_threshold;
+ dest->l_threshold = udest->l_threshold;
+}
+
+
+/*
+ * Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
+ struct ip_vs_dest **dest_p)
+{
+ struct ip_vs_dest *dest;
+ unsigned atype;
+
+ EnterFunction(2);
+
+ atype = inet_addr_type(udest->addr);
+ if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+ return -EINVAL;
+
+ dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+ if (dest == NULL) {
+ IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
+ return -ENOMEM;
+ }
+ memset(dest, 0, sizeof(struct ip_vs_dest));
+
+ dest->protocol = svc->protocol;
+ dest->vaddr = svc->addr;
+ dest->vport = svc->port;
+ dest->vfwmark = svc->fwmark;
+ dest->addr = udest->addr;
+ dest->port = udest->port;
+
+ atomic_set(&dest->activeconns, 0);
+ atomic_set(&dest->inactconns, 0);
+ atomic_set(&dest->persistconns, 0);
+ atomic_set(&dest->refcnt, 0);
+
+ INIT_LIST_HEAD(&dest->d_list);
+ dest->dst_lock = SPIN_LOCK_UNLOCKED;
+ dest->stats.lock = SPIN_LOCK_UNLOCKED;
+ __ip_vs_update_dest(svc, dest, udest);
+ ip_vs_new_estimator(&dest->stats);
+
+ *dest_p = dest;
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+/*
+ * Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+ int ret;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+ return -ERANGE;
+ }
+
+ /*
+ * Check if the dest already exists in the list
+ */
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest != NULL) {
+ IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
+ return -EEXIST;
+ }
+
+ /*
+ * Check if the dest already exists in the trash and
+ * is from the same service
+ */
+ dest = ip_vs_trash_get_dest(svc, daddr, dport);
+ if (dest != NULL) {
+ IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
+ "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+ NIPQUAD(daddr), ntohs(dport),
+ atomic_read(&dest->refcnt),
+ dest->vfwmark,
+ NIPQUAD(dest->vaddr),
+ ntohs(dest->vport));
+ __ip_vs_update_dest(svc, dest, udest);
+
+ /*
+ * Get the destination from the trash
+ */
+ list_del(&dest->n_list);
+
+ ip_vs_new_estimator(&dest->stats);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+
+ /* call the update_service function of its scheduler */
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+ return 0;
+ }
+
+ /*
+ * Allocate and initialize the dest structure
+ */
+ ret = ip_vs_new_dest(svc, udest, &dest);
+ if (ret) {
+ return ret;
+ }
+
+ /*
+ * Add the dest entry into the list
+ */
+ atomic_inc(&dest->refcnt);
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ list_add(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+
+ /* call the update_service function of its scheduler */
+ svc->scheduler->update_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+ return -ERANGE;
+ }
+
+ /*
+ * Lookup the destination list
+ */
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
+ return -ENOENT;
+ }
+
+ __ip_vs_update_dest(svc, dest, udest);
+
+ /* call the update_service, because server weight may be changed */
+ svc->scheduler->update_service(svc);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+{
+ ip_vs_kill_estimator(&dest->stats);
+
+ /*
+ * Remove it from the d-linked list with the real services.
+ */
+ write_lock_bh(&__ip_vs_rs_lock);
+ ip_vs_rs_unhash(dest);
+ write_unlock_bh(&__ip_vs_rs_lock);
+
+ /*
+ * Decrease the refcnt of the dest, and free the dest
+ * if nobody refers to it (refcnt=0). Otherwise, throw
+ * the destination into the trash.
+ */
+ if (atomic_dec_and_test(&dest->refcnt)) {
+ ip_vs_dst_reset(dest);
+ /* simply decrease svc->refcnt here, let the caller check
+ and release the service if nobody refers to it.
+ Only user context can release destination and service,
+ and only one user context can update virtual service at a
+ time, so the operation here is OK */
+ atomic_dec(&dest->svc->refcnt);
+ kfree(dest);
+ } else {
+ IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ list_add(&dest->n_list, &ip_vs_dest_trash);
+ atomic_inc(&dest->refcnt);
+ }
+}
+
+
+/*
+ * Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ int svcupd)
+{
+ dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+ /*
+ * Remove it from the d-linked destination list.
+ */
+ list_del(&dest->n_list);
+ svc->num_dests--;
+ if (svcupd) {
+ /*
+ * Call the update_service function of its scheduler
+ */
+ svc->scheduler->update_service(svc);
+ }
+}
+
+
+/*
+ * Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
+{
+ struct ip_vs_dest *dest;
+ __u32 daddr = udest->addr;
+ __u16 dport = udest->port;
+
+ EnterFunction(2);
+
+ dest = ip_vs_lookup_dest(svc, daddr, dport);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
+ return -ENOENT;
+ }
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ /*
+ * Unlink dest from the service
+ */
+ __ip_vs_unlink_dest(svc, dest, 1);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Delete the destination
+ */
+ __ip_vs_del_dest(dest);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+
+/*
+ * Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
+{
+ int ret = 0;
+ struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_service *svc = NULL;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Lookup the scheduler by 'u->sched_name' */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ ret = -ENOENT;
+ goto out_mod_dec;
+ }
+
+ svc = (struct ip_vs_service *)
+ kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+ if (svc == NULL) {
+ IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ memset(svc, 0, sizeof(struct ip_vs_service));
+
+ /* I'm the first user of the service */
+ atomic_set(&svc->usecnt, 1);
+ atomic_set(&svc->refcnt, 0);
+
+ svc->protocol = u->protocol;
+ svc->addr = u->addr;
+ svc->port = u->port;
+ svc->fwmark = u->fwmark;
+ svc->flags = u->flags;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ INIT_LIST_HEAD(&svc->destinations);
+ svc->sched_lock = RW_LOCK_UNLOCKED;
+ svc->stats.lock = SPIN_LOCK_UNLOCKED;
+
+ /* Bind the scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
+
+ /* Update the virtual service counters */
+ if (svc->port == FTPPORT)
+ atomic_inc(&ip_vs_ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_inc(&ip_vs_nullsvc_counter);
+
+ ip_vs_new_estimator(&svc->stats);
+ ip_vs_num_services++;
+
+ /* Hash the service into the service table */
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_hash(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ *svc_p = svc;
+ return 0;
+
+ out_err:
+ if (svc != NULL) {
+ if (svc->scheduler)
+ ip_vs_unbind_scheduler(svc);
+ if (svc->inc) {
+ local_bh_disable();
+ ip_vs_app_inc_put(svc->inc);
+ local_bh_enable();
+ }
+ kfree(svc);
+ }
+ ip_vs_scheduler_put(sched);
+
+ out_mod_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+/*
+ * Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
+{
+ struct ip_vs_scheduler *sched, *old_sched;
+ int ret = 0;
+
+ /*
+ * Lookup the scheduler, by 'u->sched_name'
+ */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+ u->sched_name);
+ return -ENOENT;
+ }
+ old_sched = sched;
+
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ /*
+ * Wait until all other svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ /*
+ * Set the flags and timeout value
+ */
+ svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ old_sched = svc->scheduler;
+ if (sched != old_sched) {
+ /*
+ * Unbind the old scheduler
+ */
+ if ((ret = ip_vs_unbind_scheduler(svc))) {
+ old_sched = sched;
+ goto out;
+ }
+
+ /*
+ * Bind the new scheduler
+ */
+ if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+ /*
+ * If ip_vs_bind_scheduler fails, restore the old
+ * scheduler.
+ * The main reason of failure is out of memory.
+ *
+ * The question is if the old scheduler can be
+ * restored all the time. TODO: if it cannot be
+ * restored some time, we must delete the service,
+ * otherwise the system may crash.
+ */
+ ip_vs_bind_scheduler(svc, old_sched);
+ old_sched = sched;
+ goto out;
+ }
+ }
+
+ out:
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ if (old_sched)
+ ip_vs_scheduler_put(old_sched);
+
+ return ret;
+}
+
+
+/*
+ * Delete a service from the service list
+ * - The service must be unlinked, unlocked and not referenced!
+ * - We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+ struct list_head *l;
+ struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *old_sched;
+
+ ip_vs_num_services--;
+ ip_vs_kill_estimator(&svc->stats);
+
+ /* Unbind scheduler */
+ old_sched = svc->scheduler;
+ ip_vs_unbind_scheduler(svc);
+ if (old_sched)
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind app inc */
+ if (svc->inc) {
+ ip_vs_app_inc_put(svc->inc);
+ svc->inc = NULL;
+ }
+
+ /*
+ * Unlink the whole destination list
+ */
+ l = &svc->destinations;
+ while (l->next != l) {
+ dest = list_entry(l->next, struct ip_vs_dest, n_list);
+ __ip_vs_unlink_dest(svc, dest, 0);
+ __ip_vs_del_dest(dest);
+ }
+
+ /*
+ * Update the virtual service counters
+ */
+ if (svc->port == FTPPORT)
+ atomic_dec(&ip_vs_ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_dec(&ip_vs_nullsvc_counter);
+
+ /*
+ * Free the service if nobody refers to it
+ */
+ if (atomic_read(&svc->refcnt) == 0)
+ kfree(svc);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+
+ /*
+ * Unhash it from the service table
+ */
+ write_lock_bh(&__ip_vs_svc_lock);
+
+ ip_vs_svc_unhash(svc);
+
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+ __ip_vs_del_service(svc);
+
+ write_unlock_bh(&__ip_vs_svc_lock);
+
+ return 0;
+}
+
+
+/*
+ * Flush all the virtual services
+ */
+static int ip_vs_flush(void)
+{
+ int idx;
+ struct ip_vs_service *svc;
+ struct list_head *l;
+
+ /*
+ * Flush the service table hashed by <protocol,addr,port>
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ l = &ip_vs_svc_table[idx];
+ while (l->next != l) {
+ svc = list_entry(l->next,struct ip_vs_service,s_list);
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_unhash(svc);
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ __ip_vs_del_service(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ }
+ }
+
+ /*
+ * Flush the service table hashed by fwmark
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ l = &ip_vs_svc_fwm_table[idx];
+ while (l->next != l) {
+ svc = list_entry(l->next,struct ip_vs_service,f_list);
+ write_lock_bh(&__ip_vs_svc_lock);
+ ip_vs_svc_unhash(svc);
+ /*
+ * Wait until all the svc users go away.
+ */
+ IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ __ip_vs_del_service(svc);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Zero counters in a service or all services
+ */
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+ spin_lock_bh(&stats->lock);
+ memset(stats, 0, (char *)&stats->lock - (char *)stats);
+ spin_unlock_bh(&stats->lock);
+ ip_vs_zero_estimator(stats);
+}
+
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+ struct list_head *l;
+ struct ip_vs_dest *dest;
+
+ write_lock_bh(&__ip_vs_svc_lock);
+ list_for_each (l, &svc->destinations) {
+ dest = list_entry(l, struct ip_vs_dest, n_list);
+ ip_vs_zero_stats(&dest->stats);
+ }
+ ip_vs_zero_stats(&svc->stats);
+ write_unlock_bh(&__ip_vs_svc_lock);
+ return 0;
+}
+
+static int ip_vs_zero_all(void)
+{
+ int idx;
+ struct list_head *l;
+ struct ip_vs_service *svc;
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each (l, &ip_vs_svc_table[idx]) {
+ svc = list_entry(l, struct ip_vs_service, s_list);
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each (l, &ip_vs_svc_fwm_table[idx]) {
+ svc = list_entry(l, struct ip_vs_service, f_list);
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ ip_vs_zero_stats(&ip_vs_stats);
+ return 0;
+}
+
+
+static int
+proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+ void *buffer, size_t *lenp)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, filp, buffer, lenp);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 3)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ local_bh_disable();
+ update_defense_level();
+ local_bh_enable();
+ }
+ }
+ return rc;
+}
+
+
+static int
+proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+ void *buffer, size_t *lenp)
+{
+ int *valp = table->data;
+ int val[2];
+ int rc;
+
+ /* backup the value first */
+ memcpy(val, valp, sizeof(val));
+
+ rc = proc_dointvec(table, write, filp, buffer, lenp);
+ if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+ /* Restore the correct value */
+ memcpy(valp, val, sizeof(val));
+ }
+ return rc;
+}
+
+
+/*
+ * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ */
+struct ip_vs_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vs_vars[NET_IPV4_VS_LAST];
+ ctl_table vs_dir[2];
+ ctl_table ipv4_dir[2];
+ ctl_table root_dir[2];
+};
+
+static struct ip_vs_sysctl_table ipv4_vs_table = {
+ NULL,
+ {{NET_IPV4_VS_AMEMTHRESH, "amemthresh",
+ &sysctl_ip_vs_amemthresh, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#ifdef CONFIG_IP_VS_DEBUG
+ {NET_IPV4_VS_DEBUG_LEVEL, "debug_level",
+ &sysctl_ip_vs_debug_level, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#endif
+ {NET_IPV4_VS_AMDROPRATE, "am_droprate",
+ &sysctl_ip_vs_am_droprate, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_VS_DROP_ENTRY, "drop_entry",
+ &sysctl_ip_vs_drop_entry, sizeof(int), 0644, NULL,
+ &proc_do_defense_mode},
+ {NET_IPV4_VS_DROP_PACKET, "drop_packet",
+ &sysctl_ip_vs_drop_packet, sizeof(int), 0644, NULL,
+ &proc_do_defense_mode},
+ {NET_IPV4_VS_SECURE_TCP, "secure_tcp",
+ &sysctl_ip_vs_secure_tcp, sizeof(int), 0644, NULL,
+ &proc_do_defense_mode},
+#if 0
+ {NET_IPV4_VS_TO_ES, "timeout_established",
+ &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_SS, "timeout_synsent",
+ &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_SR, "timeout_synrecv",
+ &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_FW, "timeout_finwait",
+ &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_TW, "timeout_timewait",
+ &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_CL, "timeout_close",
+ &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_CW, "timeout_closewait",
+ &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_LA, "timeout_lastack",
+ &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_LI, "timeout_listen",
+ &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_SA, "timeout_synack",
+ &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_UDP, "timeout_udp",
+ &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {NET_IPV4_VS_TO_ICMP, "timeout_icmp",
+ &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+#endif
+ {NET_IPV4_VS_CACHE_BYPASS, "cache_bypass",
+ &sysctl_ip_vs_cache_bypass, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn",
+ &sysctl_ip_vs_expire_nodest_conn, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold",
+ &sysctl_ip_vs_sync_threshold, sizeof(sysctl_ip_vs_sync_threshold),
+ 0644, NULL, &proc_do_sync_threshold},
+ {NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send",
+ &sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+ {0}},
+ {{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars},
+ {0}},
+ {{NET_IPV4, "ipv4", NULL, 0, 0555, ipv4_vs_table.vs_dir},
+ {0}},
+ {{CTL_NET, "net", NULL, 0, 0555, ipv4_vs_table.ipv4_dir},
+ {0}}
+};
+
+
+/*
+ * Write the contents of the VS rule table to a PROCfs file.
+ * (It is kept just for backward compatibility)
+ */
+static inline char *ip_vs_fwd_name(unsigned flags)
+{
+ char *fwd;
+
+ switch (flags & IP_VS_CONN_F_FWD_MASK) {
+ case IP_VS_CONN_F_LOCALNODE:
+ fwd = "Local";
+ break;
+ case IP_VS_CONN_F_TUNNEL:
+ fwd = "Tunnel";
+ break;
+ case IP_VS_CONN_F_DROUTE:
+ fwd = "Route";
+ break;
+ default:
+ fwd = "Masq";
+ }
+ return fwd;
+}
+
+static inline int sprintf_dest(char *str, struct ip_vs_dest *dest)
+{
+ return sprintf(str, " -> %08X:%04X %-7s %-6d %-10d %-10d",
+ ntohl(dest->addr), ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+}
+
+static int ip_vs_get_info(char *buf, char **start, off_t offset, int length)
+{
+ int len=0;
+ off_t pos=0;
+ char temp[64], temp2[32];
+ int idx;
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ struct list_head *l, *e, *p, *q;
+
+ /*
+ * Note: since the length of the buffer is usually the multiple
+ * of 512, it is good to use fixed record of the divisor of 512,
+ * so that records won't be truncated at buffer boundary.
+ */
+ pos = 192;
+ if (pos > offset) {
+ sprintf(temp,
+ "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ len += sprintf(buf+len, "%-63s\n", temp);
+ len += sprintf(buf+len, "%-63s\n",
+ "Prot LocalAddress:Port Scheduler Flags");
+ len += sprintf(buf+len, "%-63s\n",
+ " -> RemoteAddress:Port Forward Weight ActiveConn InActConn");
+ }
+
+ read_lock_bh(&__ip_vs_svc_lock);
+
+ /* print the service table hashed by <protocol,addr,port> */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ l = &ip_vs_svc_table[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ svc = list_entry(e, struct ip_vs_service, s_list);
+ pos += 64;
+ if (pos > offset) {
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ sprintf(temp2, "persistent %d %08X",
+ svc->timeout,
+ ntohl(svc->netmask));
+ else
+ temp2[0] = '\0';
+
+ sprintf(temp, "%s %08X:%04X %s %s",
+ ip_vs_proto_name(svc->protocol),
+ ntohl(svc->addr),
+ ntohs(svc->port),
+ svc->scheduler->name, temp2);
+ len += sprintf(buf+len, "%-63s\n", temp);
+ if (len >= length)
+ goto done;
+ }
+
+ p = &svc->destinations;
+ for (q=p->next; q!=p; q=q->next) {
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ pos += 64;
+ if (pos <= offset)
+ continue;
+ sprintf_dest(temp, dest);
+ len += sprintf(buf+len, "%-63s\n", temp);
+ if (len >= length)
+ goto done;
+ }
+ }
+ }
+
+ /* print the service table hashed by fwmark */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ l = &ip_vs_svc_fwm_table[idx];
+ for (e=l->next; e!=l; e=e->next) {
+ svc = list_entry(e, struct ip_vs_service, f_list);
+ pos += 64;
+ if (pos > offset) {
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ sprintf(temp2, "persistent %d %08X",
+ svc->timeout,
+ ntohl(svc->netmask));
+ else
+ temp2[0] = '\0';
+
+ sprintf(temp, "FWM %08X %s %s",
+ svc->fwmark,
+ svc->scheduler->name, temp2);
+ len += sprintf(buf+len, "%-63s\n", temp);
+ if (len >= length)
+ goto done;
+ }
+
+ p = &svc->destinations;
+ for (q=p->next; q!=p; q=q->next) {
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ pos += 64;
+ if (pos <= offset)
+ continue;
+ sprintf_dest(temp, dest);
+ len += sprintf(buf+len, "%-63s\n", temp);
+ if (len >= length)
+ goto done;
+ }
+ }
+ }
+
+ done:
+ read_unlock_bh(&__ip_vs_svc_lock);
+
+ *start = buf+len-(pos-offset); /* Start of wanted data */
+ len = pos-offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+
+struct ip_vs_stats ip_vs_stats;
+
+static int
+ip_vs_stats_get_info(char *buf, char **start, off_t offset, int length)
+{
+ int len=0;
+ off_t pos=0;
+ char temp[64];
+
+ pos += 320;
+ if (pos > offset) {
+ len += sprintf(buf+len, "%-63s\n%-63s\n",
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ " Total Incoming Outgoing Incoming Outgoing",
+ " Conns Packets Packets Bytes Bytes");
+
+ spin_lock_bh(&ip_vs_stats.lock);
+ sprintf(temp, "%8X %8X %8X %8X%08X %8X%08X",
+ ip_vs_stats.conns,
+ ip_vs_stats.inpkts,
+ ip_vs_stats.outpkts,
+ (__u32)(ip_vs_stats.inbytes>>32),
+ (__u32)ip_vs_stats.inbytes,
+ (__u32)(ip_vs_stats.outbytes>>32),
+ (__u32)ip_vs_stats.outbytes);
+ len += sprintf(buf+len, "%-62s\n\n", temp);
+
+ len += sprintf(buf+len, "%-63s\n",
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s");
+ sprintf(temp, "%8X %8X %8X %16X %16X",
+ ip_vs_stats.cps,
+ ip_vs_stats.inpps,
+ ip_vs_stats.outpps,
+ ip_vs_stats.inbps,
+ ip_vs_stats.outbps);
+ len += sprintf(buf+len, "%-63s\n", temp);
+
+ spin_unlock_bh(&ip_vs_stats.lock);
+ }
+
+ *start = buf+len-(pos-offset); /* Start of wanted data */
+ len = pos-offset;
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+ return len;
+}
+
+
+/*
+ * Set timeout values for tcp tcpfin udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+{
+ IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+ u->tcp_timeout,
+ u->tcp_fin_timeout,
+ u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (u->tcp_timeout) {
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ = u->tcp_timeout * HZ;
+ }
+
+ if (u->tcp_fin_timeout) {
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ = u->tcp_fin_timeout * HZ;
+ }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (u->udp_timeout) {
+ ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+ = u->udp_timeout * HZ;
+ }
+#endif
+ return 0;
+}
+
+
+#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \
+ sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user))
+#define MAX_ARG_LEN SVCDEST_ARG_LEN
+
+static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+ [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0,
+ [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN,
+ [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN,
+};
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void *user, unsigned int len)
+{
+ int ret;
+ unsigned char arg[MAX_ARG_LEN];
+ struct ip_vs_service_user *usvc;
+ struct ip_vs_service *svc;
+ struct ip_vs_dest_user *udest;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (len != set_arglen[SET_CMDID(cmd)]) {
+ IP_VS_ERR("set_ctl: len %u != %u\n",
+ len, set_arglen[SET_CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, len) != 0)
+ return -EFAULT;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ if (down_interruptible(&__ip_vs_mutex)) {
+ ret = -ERESTARTSYS;
+ goto out_dec;
+ }
+
+ if (cmd == IP_VS_SO_SET_FLUSH) {
+ /* Flush the virtual service */
+ ret = ip_vs_flush();
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+ /* Set timeout values for (tcp tcpfin udp) */
+ ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+ ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+ ret = stop_sync_thread(dm->state);
+ goto out_unlock;
+ }
+
+ usvc = (struct ip_vs_service_user *)arg;
+ udest = (struct ip_vs_dest_user *)(usvc + 1);
+
+ if (cmd == IP_VS_SO_SET_ZERO) {
+ /* if no service address is set, zero counters in all */
+ if (!usvc->fwmark && !usvc->addr && !usvc->port) {
+ ret = ip_vs_zero_all();
+ goto out_unlock;
+ }
+ }
+
+ /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
+ if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
+ IP_VS_INFO("vs_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s",
+ ntohs(usvc->protocol), NIPQUAD(usvc->addr),
+ ntohs(usvc->port), usvc->sched_name);
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* Lookup the exact service by <protocol, addr, port> or fwmark */
+ if (usvc->fwmark == 0)
+ svc = __ip_vs_service_get(usvc->protocol,
+ usvc->addr, usvc->port);
+ else
+ svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+
+ if (cmd != IP_VS_SO_SET_ADD
+ && (svc == NULL || svc->protocol != usvc->protocol)) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case IP_VS_SO_SET_ADD:
+ if (svc != NULL)
+ ret = -EEXIST;
+ else
+ ret = ip_vs_add_service(usvc, &svc);
+ break;
+ case IP_VS_SO_SET_EDIT:
+ ret = ip_vs_edit_service(svc, usvc);
+ break;
+ case IP_VS_SO_SET_DEL:
+ ret = ip_vs_del_service(svc);
+ if (!ret)
+ goto out_unlock;
+ break;
+ case IP_VS_SO_SET_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ case IP_VS_SO_SET_ADDDEST:
+ ret = ip_vs_add_dest(svc, udest);
+ break;
+ case IP_VS_SO_SET_EDITDEST:
+ ret = ip_vs_edit_dest(svc, udest);
+ break;
+ case IP_VS_SO_SET_DELDEST:
+ ret = ip_vs_del_dest(svc, udest);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (svc)
+ ip_vs_service_put(svc);
+
+ out_unlock:
+ up(&__ip_vs_mutex);
+ out_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+ spin_lock_bh(&src->lock);
+ memcpy(dst, src, (char*)&src->lock - (char*)src);
+ spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+ dst->protocol = src->protocol;
+ dst->addr = src->addr;
+ dst->port = src->port;
+ dst->fwmark = src->fwmark;
+ strcpy(dst->sched_name, src->scheduler->name);
+ dst->flags = src->flags;
+ dst->timeout = src->timeout / HZ;
+ dst->netmask = src->netmask;
+ dst->num_dests = src->num_dests;
+ ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+static inline int
+__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+ struct ip_vs_get_services *uptr)
+{
+ int idx, count=0;
+ struct ip_vs_service *svc;
+ struct list_head *l;
+ struct ip_vs_service_entry entry;
+ int ret = 0;
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each (l, &ip_vs_svc_table[idx]) {
+ if (count >= get->num_services)
+ goto out;
+ svc = list_entry(l, struct ip_vs_service, s_list);
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ list_for_each (l, &ip_vs_svc_fwm_table[idx]) {
+ if (count >= get->num_services)
+ goto out;
+ svc = list_entry(l, struct ip_vs_service, f_list);
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+ out:
+ return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+ struct ip_vs_get_dests *uptr)
+{
+ struct ip_vs_service *svc;
+ int ret = 0;
+
+ if (get->fwmark)
+ svc = __ip_vs_svc_fwm_get(get->fwmark);
+ else
+ svc = __ip_vs_service_get(get->protocol,
+ get->addr, get->port);
+ if (svc) {
+ int count = 0;
+ struct ip_vs_dest *dest;
+ struct list_head *l, *e;
+ struct ip_vs_dest_entry entry;
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ if (count >= get->num_dests)
+ break;
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ entry.addr = dest->addr;
+ entry.port = dest->port;
+ entry.conn_flags = atomic_read(&dest->conn_flags);
+ entry.weight = atomic_read(&dest->weight);
+ entry.u_threshold = dest->u_threshold;
+ entry.l_threshold = dest->l_threshold;
+ entry.activeconns = atomic_read(&dest->activeconns);
+ entry.inactconns = atomic_read(&dest->inactconns);
+ entry.persistconns = atomic_read(&dest->persistconns);
+ ip_vs_copy_stats(&entry.stats, &dest->stats);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ break;
+ }
+ count++;
+ }
+ ip_vs_service_put(svc);
+ } else
+ ret = -ESRCH;
+ return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ u->tcp_timeout =
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout =
+ ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ u->udp_timeout =
+ ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2)
+
+static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+ [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64,
+ [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN,
+ [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN,
+};
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void *user, int *len)
+{
+ unsigned char arg[128];
+ int ret = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (*len < get_arglen[GET_CMDID(cmd)]) {
+ IP_VS_ERR("get_ctl: len %u < %u\n",
+ *len, get_arglen[GET_CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+ return -EFAULT;
+
+ if (down_interruptible(&__ip_vs_mutex))
+ return -ERESTARTSYS;
+
+ switch (cmd) {
+ case IP_VS_SO_GET_VERSION:
+ {
+ char buf[64];
+
+ sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+ if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *len = strlen(buf)+1;
+ }
+ break;
+
+ case IP_VS_SO_GET_INFO:
+ {
+ struct ip_vs_getinfo info;
+ info.version = IP_VS_VERSION_CODE;
+ info.size = IP_VS_CONN_TAB_SIZE;
+ info.num_services = ip_vs_num_services;
+ if (copy_to_user(user, &info, sizeof(info)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICES:
+ {
+ struct ip_vs_get_services *get;
+ int size;
+
+ get = (struct ip_vs_get_services *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_service_entry) * get->num_services;
+ if (*len != size) {
+ IP_VS_ERR("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_service_entries(get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICE:
+ {
+ struct ip_vs_service_entry *entry;
+ struct ip_vs_service *svc;
+
+ entry = (struct ip_vs_service_entry *)arg;
+ if (entry->fwmark)
+ svc = __ip_vs_svc_fwm_get(entry->fwmark);
+ else
+ svc = __ip_vs_service_get(entry->protocol,
+ entry->addr, entry->port);
+ if (svc) {
+ ip_vs_copy_service(entry, svc);
+ if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+ ret = -EFAULT;
+ ip_vs_service_put(svc);
+ } else
+ ret = -ESRCH;
+ }
+ break;
+
+ case IP_VS_SO_GET_DESTS:
+ {
+ struct ip_vs_get_dests *get;
+ int size;
+
+ get = (struct ip_vs_get_dests *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ if (*len != size) {
+ IP_VS_ERR("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_dest_entries(get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_TIMEOUT:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(&t);
+ if (copy_to_user(user, &t, sizeof(t)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_DAEMON:
+ {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
+ d[0].syncid = ip_vs_master_syncid;
+ }
+ if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
+ d[1].syncid = ip_vs_backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+ out:
+ up(&__ip_vs_mutex);
+ return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+ { NULL, NULL }, PF_INET,
+ IP_VS_BASE_CTL, IP_VS_SO_SET_MAX+1, do_ip_vs_set_ctl,
+ IP_VS_BASE_CTL, IP_VS_SO_GET_MAX+1, do_ip_vs_get_ctl
+};
+
+
+int ip_vs_control_init(void)
+{
+ int ret;
+ int idx;
+
+ EnterFunction(2);
+
+ ret = nf_register_sockopt(&ip_vs_sockopts);
+ if (ret) {
+ IP_VS_ERR("cannot register sockopt.\n");
+ return ret;
+ }
+
+ proc_net_create("ip_vs", 0, ip_vs_get_info);
+ proc_net_create("ip_vs_stats", 0, ip_vs_stats_get_info);
+
+ ipv4_vs_table.sysctl_header =
+ register_sysctl_table(ipv4_vs_table.root_dir, 0);
+
+ /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+ for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) {
+ INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+ }
+
+ memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
+ ip_vs_stats.lock = SPIN_LOCK_UNLOCKED;
+ ip_vs_new_estimator(&ip_vs_stats);
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+ EnterFunction(2);
+ ip_vs_trash_cleanup();
+ ip_vs_kill_estimator(&ip_vs_stats);
+ unregister_sysctl_table(ipv4_vs_table.sysctl_header);
+ proc_net_remove("ip_vs_stats");
+ proc_net_remove("ip_vs");
+ nf_unregister_sockopt(&ip_vs_sockopts);
+ LeaveFunction(2);
+}
--- /dev/null
+/*
+ * IPVS: Destination Hashing scheduling module
+ *
+ * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * Inspired by the consistent hashing scheduler patch from
+ * Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[dest_ip];
+ * if (n is dead) OR
+ * (n is overloaded) OR (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+ struct ip_vs_dest *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ * Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
+{
+ return (tbl[ip_vs_dh_hashkey(addr)]).dest;
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+
+ b = tbl;
+ p = &svc->destinations;
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ if (list_empty(p)) {
+ b->dest = NULL;
+ } else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ atomic_inc(&dest->refcnt);
+ b->dest = dest;
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+
+ b = tbl;
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ if (b->dest) {
+ atomic_dec(&b->dest->refcnt);
+ b->dest = NULL;
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl;
+
+ /* allocate the DH table for this service */
+ tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+ GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "DH hash table (memory=%dbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "DH hash table (memory=%dbytes) released\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ return 0;
+}
+
+
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(tbl);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+/*
+ * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ * consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_dh_bucket *tbl;
+
+ IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
+ dest = ip_vs_dh_get(tbl, iph->daddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ return NULL;
+ }
+
+ IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(iph->daddr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+ .name = "dh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_dh_init_svc,
+ .done_service = ip_vs_dh_done_svc,
+ .update_service = ip_vs_dh_update_svc,
+ .schedule = ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <net/ip_vs.h>
+
+/*
+ This code is to estimate rate in a shorter interval (such as 8
+ seconds) for virtual services and real servers. For measure rate in a
+ long interval, it is easy to implement a user level daemon which
+ periodically reads those statistical counters and measure rate.
+
+ Currently, the measurement is activated by slow timer handler. Hope
+ this measurement will not introduce too much load.
+
+ We measure rate during the last 8 seconds every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * The stored value for average bps is scaled by 2^5, so that maximal
+ rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+ * A lot code is taken from net/sched/estimator.c
+ */
+
+
+struct ip_vs_estimator
+{
+ struct ip_vs_estimator *next;
+ struct ip_vs_stats *stats;
+
+ u32 last_conns;
+ u32 last_inpkts;
+ u32 last_outpkts;
+ u64 last_inbytes;
+ u64 last_outbytes;
+
+ u32 cps;
+ u32 inpps;
+ u32 outpps;
+ u32 inbps;
+ u32 outbps;
+};
+
+
+static struct ip_vs_estimator *est_list = NULL;
+static rwlock_t est_lock = RW_LOCK_UNLOCKED;
+static struct timer_list est_timer;
+
+static void estimation_timer(unsigned long arg)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_stats *s;
+ u32 n_conns;
+ u32 n_inpkts, n_outpkts;
+ u64 n_inbytes, n_outbytes;
+ u32 rate;
+
+ read_lock(&est_lock);
+ for (e = est_list; e; e = e->next) {
+ s = e->stats;
+ n_conns = s->conns;
+ n_inpkts = s->inpkts;
+ n_outpkts = s->outpkts;
+ n_inbytes = s->inbytes;
+ n_outbytes = s->outbytes;
+
+ /* scaled by 2^10, but divided 2 seconds */
+ rate = (n_conns - e->last_conns)<<9;
+ e->last_conns = n_conns;
+ e->cps += ((long)rate - (long)e->cps)>>2;
+ s->cps = (e->cps+0x1FF)>>10;
+
+ rate = (n_inpkts - e->last_inpkts)<<9;
+ e->last_inpkts = n_inpkts;
+ e->inpps += ((long)rate - (long)e->inpps)>>2;
+ s->inpps = (e->inpps+0x1FF)>>10;
+
+ rate = (n_outpkts - e->last_outpkts)<<9;
+ e->last_outpkts = n_outpkts;
+ e->outpps += ((long)rate - (long)e->outpps)>>2;
+ s->outpps = (e->outpps+0x1FF)>>10;
+
+ rate = (n_inbytes - e->last_inbytes)<<4;
+ e->last_inbytes = n_inbytes;
+ e->inbps += ((long)rate - (long)e->inbps)>>2;
+ s->inbps = (e->inbps+0xF)>>5;
+
+ rate = (n_outbytes - e->last_outbytes)<<4;
+ e->last_outbytes = n_outbytes;
+ e->outbps += ((long)rate - (long)e->outbps)>>2;
+ s->outbps = (e->outbps+0xF)>>5;
+ }
+ read_unlock(&est_lock);
+ mod_timer(&est_timer, jiffies + 2*HZ);
+}
+
+int ip_vs_new_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est;
+
+ est = kmalloc(sizeof(*est), GFP_KERNEL);
+ if (est == NULL)
+ return -ENOMEM;
+
+ memset(est, 0, sizeof(*est));
+ est->stats = stats;
+ est->last_conns = stats->conns;
+ est->cps = stats->cps<<10;
+
+ est->last_inpkts = stats->inpkts;
+ est->inpps = stats->inpps<<10;
+
+ est->last_outpkts = stats->outpkts;
+ est->outpps = stats->outpps<<10;
+
+ est->last_inbytes = stats->inbytes;
+ est->inbps = stats->inbps<<5;
+
+ est->last_outbytes = stats->outbytes;
+ est->outbps = stats->outbps<<5;
+
+ write_lock_bh(&est_lock);
+ est->next = est_list;
+ if (est->next == NULL) {
+ init_timer(&est_timer);
+ est_timer.expires = jiffies + 2*HZ;
+ est_timer.function = estimation_timer;
+ add_timer(&est_timer);
+ }
+ est_list = est;
+ write_unlock_bh(&est_lock);
+ return 0;
+}
+
+void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est, **pest;
+ int killed = 0;
+
+ write_lock_bh(&est_lock);
+ pest = &est_list;
+ while ((est=*pest) != NULL) {
+ if (est->stats != stats) {
+ pest = &est->next;
+ continue;
+ }
+ *pest = est->next;
+ kfree(est);
+ killed++;
+ }
+ if (killed && est_list == NULL)
+ del_timer_sync(&est_timer);
+ write_unlock_bh(&est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e;
+
+ write_lock_bh(&est_lock);
+ for (e = est_list; e; e = e->next) {
+ if (e->stats != stats)
+ continue;
+
+ /* set counters zero */
+ e->last_conns = 0;
+ e->last_inpkts = 0;
+ e->last_outpkts = 0;
+ e->last_inbytes = 0;
+ e->last_outbytes = 0;
+ e->cps = 0;
+ e->inpps = 0;
+ e->outpps = 0;
+ e->inbps = 0;
+ e->outbps = 0;
+ }
+ write_unlock_bh(&est_lock);
+}
--- /dev/null
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+
+/*
+ * Debug level
+ */
+#ifdef CONFIG_IP_VS_DEBUG
+static int debug=0;
+MODULE_PARM(debug, "i");
+#endif
+
+MODULE_PARM(ports, "1-" __MODULE_STRING(IP_VS_APP_MAX_PORTS) "i");
+
+/* Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern" and terminated with the "term" character.
+ * <addr,port> is in network order.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+ const char *pattern, size_t plen, char term,
+ __u32 *addr, __u16 *port,
+ char **start, char **end)
+{
+ unsigned char p1,p2,p3,p4,p5,p6;
+
+ while (data < data_limit) {
+ if (strnicmp(data, pattern, plen) != 0) {
+ data++;
+ continue;
+ }
+ *start = data+plen;
+ p1 = simple_strtoul(data+plen, &data, 10);
+ if (*data != ',')
+ continue;
+ p2 = simple_strtoul(data+1, &data, 10);
+ if (*data != ',')
+ continue;
+ p3 = simple_strtoul(data+1, &data, 10);
+ if (*data != ',')
+ continue;
+ p4 = simple_strtoul(data+1, &data, 10);
+ if (*data != ',')
+ continue;
+ p5 = simple_strtoul(data+1, &data, 10);
+ if (*data != ',')
+ continue;
+ p6 = simple_strtoul(data+1, &data, 10);
+ if (*data != term)
+ continue;
+
+ *end = data;
+ *addr = (p4<<24) | (p3<<16) | (p2<<8) | p1;
+ *port = (p6<<8) | p5;
+ return 1;
+ }
+ return 0;
+}
+
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port. Mark the current connection entry as a control channel
+ * of the new entry. All this work is just to make the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app,
+ struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ char *start, *end;
+ __u32 from;
+ __u16 port;
+ struct ip_vs_conn *n_cp;
+ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+ unsigned buf_len;
+ int diff;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 0;
+
+ if (cp->app_data == &ip_vs_ftp_pasv) {
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)th + (th->doff << 2);
+ data_limit = skb->tail;
+
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING,
+ sizeof(SERVER_STRING)-1, ')',
+ &from, &port,
+ &start, &end) == 0)
+ return 0;
+
+ IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
+ "%u.%u.%u.%u:%d detected\n",
+ NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
+
+ /*
+ * Now update or create an connection entry for it
+ */
+ n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
+ cp->caddr, 0);
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(IPPROTO_TCP,
+ cp->caddr, 0,
+ cp->vaddr, port,
+ from, port,
+ IP_VS_CONN_F_NO_CPORT,
+ cp->dest);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Replace the old passive address with the new one
+ */
+ from = n_cp->vaddr;
+ port = n_cp->vport;
+ sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
+ port&255, port>>8&255);
+ buf_len = strlen(buf);
+
+ /*
+ * Calculate required delta-offset to keep TCP happy
+ */
+ diff = buf_len - (end-start);
+
+ if (diff == 0) {
+ /* simply replace it with new passive address */
+ memcpy(start, buf, buf_len);
+ } else {
+ /* fixme: return value isn't checked here */
+ ip_vs_skb_replace(skb, GFP_ATOMIC, start,
+ end-start, buf, buf_len);
+ }
+
+ cp->app_data = NULL;
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+ return diff;
+ }
+ return 0;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app,
+ struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_start, *data_limit;
+ char *start, *end;
+ __u32 to;
+ __u16 port;
+ struct ip_vs_conn *n_cp;
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 0;
+
+ /*
+ * Detecting whether it is passive
+ */
+ iph = skb->nh.iph;
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+ /* Since there may be OPTIONS in the TCP packet and the HLEN is
+ the length of the header in 32-bit multiples, it is accurate
+ to calculate data address by th+HLEN*4 */
+ data = data_start = (char *)th + (th->doff << 2);
+ data_limit = skb->tail;
+
+ while (data < data_limit) {
+ if (strnicmp(data, "PASV\r\n", 6) == 0) {
+ IP_VS_DBG(1-debug, "got PASV at %d of %d\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = &ip_vs_ftp_pasv;
+ return 0;
+ }
+ data++;
+ }
+
+ /*
+ * To support virtual FTP server, the scenerio is as follows:
+ * FTP client ----> Load Balancer ----> FTP server
+ * First detect the port number in the application data,
+ * then create a new connection entry for the coming data
+ * connection.
+ */
+ data = data_start;
+ data_limit = skb->h.raw + skb->len - 18;
+
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+ '\r', &to, &port,
+ &start, &end) == 0)
+ return 0;
+
+ IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
+ NIPQUAD(to), ntohs(port));
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
+ ip_vs_proto_name(iph->protocol),
+ NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0);
+
+ n_cp = ip_vs_conn_in_get(iph->protocol,
+ to, port,
+ iph->daddr, htons(ntohs(cp->vport)-1));
+ if (!n_cp) {
+ n_cp = ip_vs_conn_new(IPPROTO_TCP,
+ to, port,
+ cp->vaddr, htons(ntohs(cp->vport)-1),
+ cp->daddr, htons(ntohs(cp->dport)-1),
+ 0,
+ cp->dest);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Move tunnel to listen state
+ */
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+
+ /* no diff required for incoming packets */
+ return 0;
+}
+
+
+static struct ip_vs_app ip_vs_ftp = {
+ .name = "ftp",
+ .type = IP_VS_APP_TYPE_FTP,
+ .protocol = IPPROTO_TCP,
+ .module = THIS_MODULE,
+ .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+ .init_conn = ip_vs_ftp_init_conn,
+ .done_conn = ip_vs_ftp_done_conn,
+ .bind_conn = NULL,
+ .unbind_conn = NULL,
+ .pkt_out = ip_vs_ftp_out,
+ .pkt_in = ip_vs_ftp_in,
+};
+
+
+/*
+ * ip_vs_ftp initialization
+ */
+static int __init ip_vs_ftp_init(void)
+{
+ int i, ret;
+ struct ip_vs_app *app = &ip_vs_ftp;
+
+ ret = register_ip_vs_app(app);
+ if (ret)
+ return ret;
+
+ for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+ if (!ports[i])
+ continue;
+ ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+ if (ret)
+ break;
+ IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
+ app->name, i, ports[i]);
+ }
+
+ if (ret)
+ unregister_ip_vs_app(app);
+
+ return ret;
+}
+
+
+/*
+ * ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+ unregister_ip_vs_app(&ip_vs_ftp);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Locality-Based Least-Connection scheduling module
+ *
+ * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Martin Hamilton : fixed the terrible locking bugs
+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
+ * Wensong Zhang : fixed the uninitilized tbl->lock bug
+ * Wensong Zhang : added doing full expiration check to
+ * collect stale entries of 24+ hours when
+ * no partial expire check in a half hour
+ * Julian Anastasov : replaced del_timer call with del_timer_sync
+ * to avoid the possible race between timer
+ * handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ * if cachenode[dest_ip] is null then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- cachenode[dest_ip];
+ * if (n is dead) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ * return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+/* for systcl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblc entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
+
+
+/*
+ * for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
+#endif
+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ * IPVS lblc entry represents an association between destination
+ * IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+ struct list_head list;
+ __u32 addr; /* destination IP address */
+ struct ip_vs_dest *dest; /* real server (cache) */
+ unsigned long lastuse; /* last used time */
+};
+
+
+/*
+ * IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+ rwlock_t lock; /* lock for this table */
+ struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+};
+
+
+/*
+ * IPVS LBLC sysctl table
+ */
+struct ip_vs_lblc_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vs_vars[2];
+ ctl_table vs_dir[2];
+ ctl_table ipv4_dir[2];
+ ctl_table root_dir[2];
+};
+
+
+static struct ip_vs_lblc_sysctl_table lblc_sysctl_table = {
+ NULL,
+ {{NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration",
+ &sysctl_ip_vs_lblc_expiration,
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {0}},
+ {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblc_sysctl_table.vs_vars},
+ {0}},
+ {{NET_IPV4, "ipv4", NULL, 0, 0555, lblc_sysctl_table.vs_dir},
+ {0}},
+ {{CTL_NET, "net", NULL, 0, 0555, lblc_sysctl_table.ipv4_dir},
+ {0}}
+};
+
+
+/*
+ * new/free a ip_vs_lblc_entry, which is a mapping of a destionation
+ * IP address to a server.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblc_entry *en;
+
+ en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
+ if (en == NULL) {
+ IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&en->list);
+ en->addr = daddr;
+
+ atomic_inc(&dest->refcnt);
+ en->dest = dest;
+
+ return en;
+}
+
+
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+ list_del(&en->list);
+ /*
+ * We don't kfree dest because it is refered either by its service
+ * or the trash dest list.
+ */
+ atomic_dec(&en->dest->refcnt);
+ kfree(en);
+}
+
+
+/*
+ * Returns hash value for IPVS LBLC entry
+ */
+static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblc_table.
+ * returns bool success.
+ */
+static int
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+ unsigned hash;
+
+ if (!list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Hash by destination IP address
+ */
+ hash = ip_vs_lblc_hashkey(en->addr);
+
+ write_lock(&tbl->lock);
+ list_add(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+
+
+#if 0000
+/*
+ * Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
+ * returns bool success.
+ */
+static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
+ struct ip_vs_lblc_entry *en)
+{
+ if (list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Remove it from the table
+ */
+ write_lock(&tbl->lock);
+ list_del(&en->list);
+ INIT_LIST_HEAD(&en->list);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+#endif
+
+
+/*
+ * Get ip_vs_lblc_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
+{
+ unsigned hash;
+ struct ip_vs_lblc_entry *en;
+ struct list_head *l,*e;
+
+ hash = ip_vs_lblc_hashkey(addr);
+ l = &tbl->bucket[hash];
+
+ read_lock(&tbl->lock);
+
+ for (e=l->next; e!=l; e=e->next) {
+ en = list_entry(e, struct ip_vs_lblc_entry, list);
+ if (en->addr == addr) {
+ /* HIT */
+ read_unlock(&tbl->lock);
+ return en;
+ }
+ }
+
+ read_unlock(&tbl->lock);
+
+ return NULL;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+ int i;
+ struct list_head *l;
+ struct ip_vs_lblc_entry *en;
+
+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ write_lock(&tbl->lock);
+ for (l=&tbl->bucket[i]; l->next!=l; ) {
+ en = list_entry(l->next,
+ struct ip_vs_lblc_entry, list);
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+}
+
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
+{
+ unsigned long now = jiffies;
+ int i, j;
+ struct list_head *l, *e;
+ struct ip_vs_lblc_entry *en;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+ e = l = &tbl->bucket[j];
+ write_lock(&tbl->lock);
+ while (e->next != l) {
+ en = list_entry(e->next,
+ struct ip_vs_lblc_entry, list);
+ if ((now - en->lastuse) <
+ sysctl_ip_vs_lblc_expiration) {
+ e = e->next;
+ continue;
+ }
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblc table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+ struct ip_vs_lblc_table *tbl;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct list_head *l, *e;
+ struct ip_vs_lblc_entry *en;
+
+ tbl = (struct ip_vs_lblc_table *)data;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblc_full_check(tbl);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+ e = l = &tbl->bucket[j];
+ write_lock(&tbl->lock);
+ while (e->next != l) {
+ en = list_entry(e->next,
+ struct ip_vs_lblc_entry, list);
+ if ((now - en->lastuse) < ENTRY_TIMEOUT) {
+ e = e->next;
+ continue;
+ }
+ ip_vs_lblc_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ write_unlock(&tbl->lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblc_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblc_table for this service
+ */
+ tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_lblc_table));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_LIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->lock = RW_LOCK_UNLOCKED;
+ tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ init_timer(&tbl->periodic_timer);
+ tbl->periodic_timer.data = (unsigned long)tbl;
+ tbl->periodic_timer.function = ip_vs_lblc_check_expire;
+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+ add_timer(&tbl->periodic_timer);
+
+ return 0;
+}
+
+
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblc_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "LBLC hash table (memory=%dbytes) released\n",
+ sizeof(struct ip_vs_lblc_table));
+
+ return 0;
+}
+
+
+static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We think the overhead of processing active connections is fifty
+ * times higher than that of inactive connections in average. (This
+ * fifty times might not be accurate, we will change it later.) We
+ * use the following formula to estimate the overhead:
+ * dest->activeconns*50 + dest->inactconns
+ * and the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&least->weight) > 0) {
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ register struct list_head *l, *e;
+ struct ip_vs_dest *d;
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ d = list_entry(e, struct ip_vs_dest, n_list);
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblc_table *tbl;
+ struct ip_vs_lblc_entry *en;
+
+ IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_lblc_table *)svc->sched_data;
+ en = ip_vs_lblc_get(tbl, iph->daddr);
+ if (en == NULL) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ en = ip_vs_lblc_new(iph->daddr, dest);
+ if (en == NULL) {
+ return NULL;
+ }
+ ip_vs_lblc_hash(tbl, en);
+ } else {
+ dest = en->dest;
+ if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest, svc)) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ atomic_dec(&en->dest->refcnt);
+ atomic_inc(&dest->refcnt);
+ en->dest = dest;
+ }
+ }
+ en->lastuse = jiffies;
+
+ IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(en->addr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+ .name = "lblc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lblc_init_svc,
+ .done_service = ip_vs_lblc_done_svc,
+ .update_service = ip_vs_lblc_update_svc,
+ .schedule = ip_vs_lblc_schedule,
+};
+
+
+static int __init ip_vs_lblc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
+ lblc_sysctl_table.sysctl_header =
+ register_sysctl_table(lblc_sysctl_table.root_dir, 0);
+ return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+ unregister_sysctl_table(lblc_sysctl_table.sysctl_header);
+ unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Locality-Based Least-Connection with Replication scheduler
+ *
+ * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Julian Anastasov : Added the missing (dest->weight>0)
+ * condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ * if serverSet[dest_ip] is null then
+ * n, serverSet[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
+ * if (n is null) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n <- {weighted least-conn node};
+ * add n to serverSet[dest_ip];
+ * if |serverSet[dest_ip]| > 1 AND
+ * now - serverSet[dest_ip].lastMod > T then
+ * m <- {most conn node in serverSet[dest_ip]};
+ * remove m from serverSet[dest_ip];
+ * if serverSet[dest_ip] changed then
+ * serverSet[dest_ip].lastMod <- now;
+ *
+ * return n;
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+/* for systcl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+/* for proc_net_create/proc_net_remove */
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblcr entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
+
+
+/*
+ * for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
+#endif
+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ * IPVS destination set structure and operations
+ */
+struct ip_vs_dest_list {
+ struct ip_vs_dest_list *next; /* list link */
+ struct ip_vs_dest *dest; /* destination server */
+};
+
+struct ip_vs_dest_set {
+ atomic_t size; /* set size */
+ unsigned long lastmod; /* last modified time */
+ struct ip_vs_dest_list *list; /* destination list */
+ rwlock_t lock; /* lock for this list */
+};
+
+
+static struct ip_vs_dest_list *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_list *e;
+
+ for (e=set->list; e!=NULL; e=e->next) {
+ if (e->dest == dest)
+ /* already existed */
+ return NULL;
+ }
+
+ e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+ if (e == NULL) {
+ IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
+ return NULL;
+ }
+
+ atomic_inc(&dest->refcnt);
+ e->dest = dest;
+
+ /* link it to the list */
+ write_lock(&set->lock);
+ e->next = set->list;
+ set->list = e;
+ atomic_inc(&set->size);
+ write_unlock(&set->lock);
+
+ set->lastmod = jiffies;
+ return e;
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_list *e, **ep;
+
+ write_lock(&set->lock);
+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+ if (e->dest == dest) {
+ /* HIT */
+ *ep = e->next;
+ atomic_dec(&set->size);
+ set->lastmod = jiffies;
+ atomic_dec(&e->dest->refcnt);
+ kfree(e);
+ break;
+ }
+ ep = &e->next;
+ }
+ write_unlock(&set->lock);
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+ struct ip_vs_dest_list *e, **ep;
+
+ write_lock(&set->lock);
+ for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+ *ep = e->next;
+ /*
+ * We don't kfree dest because it is refered either
+ * by its service or by the trash dest list.
+ */
+ atomic_dec(&e->dest->refcnt);
+ kfree(e);
+ }
+ write_unlock(&set->lock);
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_list *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ read_lock(&set->lock);
+ /* select the first destination server, whose weight > 0 */
+ for (e=set->list; e!=NULL; e=e->next) {
+ least = e->dest;
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if ((atomic_read(&least->weight) > 0)
+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ read_unlock(&set->lock);
+ return NULL;
+
+ /* find the destination with the weighted least load */
+ nextstage:
+ for (e=e->next; e!=NULL; e=e->next) {
+ dest = e->dest;
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if ((loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight))
+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+ read_unlock(&set->lock);
+
+ IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+ return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_list *e;
+ struct ip_vs_dest *dest, *most;
+ int moh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ read_lock(&set->lock);
+ /* select the first destination server, whose weight > 0 */
+ for (e=set->list; e!=NULL; e=e->next) {
+ most = e->dest;
+ if (atomic_read(&most->weight) > 0) {
+ moh = atomic_read(&most->activeconns) * 50
+ + atomic_read(&most->inactconns);
+ goto nextstage;
+ }
+ }
+ read_unlock(&set->lock);
+ return NULL;
+
+ /* find the destination with the weighted most load */
+ nextstage:
+ for (e=e->next; e!=NULL; e=e->next) {
+ dest = e->dest;
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+ if ((moh * atomic_read(&dest->weight) <
+ doh * atomic_read(&most->weight))
+ && (atomic_read(&dest->weight) > 0)) {
+ most = dest;
+ moh = doh;
+ }
+ }
+ read_unlock(&set->lock);
+
+ IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(most->addr), ntohs(most->port),
+ atomic_read(&most->activeconns),
+ atomic_read(&most->refcnt),
+ atomic_read(&most->weight), moh);
+ return most;
+}
+
+
+/*
+ * IPVS lblcr entry represents an association between destination
+ * IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+ struct list_head list;
+ __u32 addr; /* destination IP address */
+ struct ip_vs_dest_set set; /* destination server set */
+ unsigned long lastuse; /* last used time */
+};
+
+
+/*
+ * IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+ rwlock_t lock; /* lock for this table */
+ struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+};
+
+
+/*
+ * IPVS LBLCR sysctl table
+ */
+struct ip_vs_lblcr_sysctl_table {
+ struct ctl_table_header *sysctl_header;
+ ctl_table vs_vars[2];
+ ctl_table vs_dir[2];
+ ctl_table ipv4_dir[2];
+ ctl_table root_dir[2];
+};
+
+
+static struct ip_vs_lblcr_sysctl_table lblcr_sysctl_table = {
+ NULL,
+ {{NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration",
+ &sysctl_ip_vs_lblcr_expiration,
+ sizeof(int), 0644, NULL, &proc_dointvec_jiffies},
+ {0}},
+ {{NET_IPV4_VS, "vs", NULL, 0, 0555, lblcr_sysctl_table.vs_vars},
+ {0}},
+ {{NET_IPV4, "ipv4", NULL, 0, 0555, lblcr_sysctl_table.vs_dir},
+ {0}},
+ {{CTL_NET, "net", NULL, 0, 0555, lblcr_sysctl_table.ipv4_dir},
+ {0}}
+};
+
+
+/*
+ * new/free a ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server.
+ */
+static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
+ if (en == NULL) {
+ IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&en->list);
+ en->addr = daddr;
+
+ /* initilize its dest set */
+ atomic_set(&(en->set.size), 0);
+ en->set.list = NULL;
+ en->set.lock = RW_LOCK_UNLOCKED;
+
+ return en;
+}
+
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+ list_del(&en->list);
+ ip_vs_dest_set_eraseall(&en->set);
+ kfree(en);
+}
+
+
+/*
+ * Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry in the ip_vs_lblcr_table.
+ * returns bool success.
+ */
+static int
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+ unsigned hash;
+
+ if (!list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Hash by destination IP address
+ */
+ hash = ip_vs_lblcr_hashkey(en->addr);
+
+ write_lock(&tbl->lock);
+ list_add(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+
+
+#if 0000
+/*
+ * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
+ * returns bool success.
+ */
+static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
+ struct ip_vs_lblcr_entry *en)
+{
+ if (list_empty(&en->list)) {
+ IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
+ "called from %p\n", __builtin_return_address(0));
+ return 0;
+ }
+
+ /*
+ * Remove it from the table
+ */
+ write_lock(&tbl->lock);
+ list_del(&en->list);
+ INIT_LIST_HEAD(&en->list);
+ write_unlock(&tbl->lock);
+
+ return 1;
+}
+#endif
+
+
+/*
+ * Get ip_vs_lblcr_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
+{
+ unsigned hash;
+ struct ip_vs_lblcr_entry *en;
+ struct list_head *l,*e;
+
+ hash = ip_vs_lblcr_hashkey(addr);
+ l = &tbl->bucket[hash];
+
+ read_lock(&tbl->lock);
+
+ for (e=l->next; e!=l; e=e->next) {
+ en = list_entry(e, struct ip_vs_lblcr_entry, list);
+ if (en->addr == addr) {
+ /* HIT */
+ read_unlock(&tbl->lock);
+ return en;
+ }
+ }
+
+ read_unlock(&tbl->lock);
+
+ return NULL;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+ int i;
+ struct list_head *l;
+ struct ip_vs_lblcr_entry *en;
+
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ write_lock(&tbl->lock);
+ for (l=&tbl->bucket[i]; l->next!=l; ) {
+ en = list_entry(l->next,
+ struct ip_vs_lblcr_entry, list);
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+}
+
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+{
+ unsigned long now = jiffies;
+ int i, j;
+ struct list_head *l, *e;
+ struct ip_vs_lblcr_entry *en;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+ e = l = &tbl->bucket[j];
+ write_lock(&tbl->lock);
+ while (e->next != l) {
+ en = list_entry(e->next,
+ struct ip_vs_lblcr_entry, list);
+ if ((now - en->lastuse) <
+ sysctl_ip_vs_lblcr_expiration) {
+ e = e->next;
+ continue;
+ }
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ write_unlock(&tbl->lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodical timer handler for IPVS lblcr table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+ struct ip_vs_lblcr_table *tbl;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct list_head *l, *e;
+ struct ip_vs_lblcr_entry *en;
+
+ tbl = (struct ip_vs_lblcr_table *)data;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblcr_full_check(tbl);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+ e = l = &tbl->bucket[j];
+ write_lock(&tbl->lock);
+ while (e->next != l) {
+ en = list_entry(e->next,
+ struct ip_vs_lblcr_entry, list);
+ if ((now - en->lastuse) < ENTRY_TIMEOUT) {
+ e = e->next;
+ continue;
+ }
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ write_unlock(&tbl->lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+static struct ip_vs_lblcr_table *lblcr_table_list;
+
+/*
+ * /proc/net/ip_vs_lblcr to display the mappings of
+ * destination IP address <==> its serverSet
+ */
+static int
+ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+ off_t pos=0, begin;
+ int len=0, size;
+ struct ip_vs_lblcr_table *tbl;
+ unsigned long now = jiffies;
+ int i;
+ struct list_head *l, *e;
+ struct ip_vs_lblcr_entry *en;
+
+ tbl = lblcr_table_list;
+
+ size = sprintf(buffer, "LastTime Dest IP address Server set\n");
+ pos += size;
+ len += size;
+
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ l = &tbl->bucket[i];
+ read_lock_bh(&tbl->lock);
+ for (e=l->next; e!=l; e=e->next) {
+ char tbuf[16];
+ struct ip_vs_dest_list *d;
+
+ en = list_entry(e, struct ip_vs_lblcr_entry, list);
+ sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
+ size = sprintf(buffer+len, "%8lu %-16s ",
+ now-en->lastuse, tbuf);
+
+ read_lock(&en->set.lock);
+ for (d=en->set.list; d!=NULL; d=d->next) {
+ size += sprintf(buffer+len+size,
+ "%u.%u.%u.%u ",
+ NIPQUAD(d->dest->addr));
+ }
+ read_unlock(&en->set.lock);
+ size += sprintf(buffer+len+size, "\n");
+ len += size;
+ pos += size;
+ if (pos <= offset)
+ len=0;
+ if (pos >= offset+length) {
+ read_unlock_bh(&tbl->lock);
+ goto done;
+ }
+ }
+ read_unlock_bh(&tbl->lock);
+ }
+
+ done:
+ begin = len - (pos - offset);
+ *start = buffer + begin;
+ len -= begin;
+ if(len>length)
+ len = length;
+ return len;
+}
+#endif
+
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblcr_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblcr_table for this service
+ */
+ tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_lblcr_table));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_LIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->lock = RW_LOCK_UNLOCKED;
+ tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ init_timer(&tbl->periodic_timer);
+ tbl->periodic_timer.data = (unsigned long)tbl;
+ tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
+ tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+ add_timer(&tbl->periodic_timer);
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ lblcr_table_list = tbl;
+#endif
+ return 0;
+}
+
+
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblcr_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "LBLCR hash table (memory=%dbytes) released\n",
+ sizeof(struct ip_vs_lblcr_table));
+
+ return 0;
+}
+
+
+static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We think the overhead of processing active connections is fifty
+ * times higher than that of inactive connections in average. (This
+ * fifty times might not be accurate, we will change it later.) We
+ * use the following formula to estimate the overhead:
+ * dest->activeconns*50 + dest->inactconns
+ * and the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if (atomic_read(&least->weight) > 0) {
+ loh = atomic_read(&least->activeconns) * 50
+ + atomic_read(&least->inactconns);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = atomic_read(&dest->activeconns) * 50
+ + atomic_read(&dest->inactconns);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ register struct list_head *l, *e;
+ struct ip_vs_dest *d;
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ d = list_entry(e, struct ip_vs_dest, n_list);
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblcr_table *tbl;
+ struct ip_vs_lblcr_entry *en;
+
+ IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+ en = ip_vs_lblcr_get(tbl, iph->daddr);
+ if (en == NULL) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ en = ip_vs_lblcr_new(iph->daddr);
+ if (en == NULL) {
+ return NULL;
+ }
+ ip_vs_dest_set_insert(&en->set, dest);
+ ip_vs_lblcr_hash(tbl, en);
+ } else {
+ dest = ip_vs_dest_set_min(&en->set);
+ if (!dest || is_overloaded(dest, svc)) {
+ dest = __ip_vs_wlc_schedule(svc, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+ ip_vs_dest_set_insert(&en->set, dest);
+ }
+ if (atomic_read(&en->set.size) > 1 &&
+ jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+ struct ip_vs_dest *m;
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ }
+ en->lastuse = jiffies;
+
+ IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(en->addr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+ .name = "lblcr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lblcr_init_svc,
+ .done_service = ip_vs_lblcr_done_svc,
+ .update_service = ip_vs_lblcr_update_svc,
+ .schedule = ip_vs_lblcr_schedule,
+};
+
+
+static int __init ip_vs_lblcr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
+ lblcr_sysctl_table.sysctl_header =
+ register_sysctl_table(lblcr_sysctl_table.root_dir, 0);
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
+#endif
+ return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+ proc_net_remove("ip_vs_lblcr");
+#endif
+ unregister_sysctl_table(lblcr_sysctl_table.sysctl_header);
+ unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Least-Connection Scheduling module
+ *
+ * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : added the ip_vs_lc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We think the overhead of processing active connections is 256
+ * times higher than that of inactive connections in average. (This
+ * 256 times might not be accurate, we will change it later) We
+ * use the following formula to estimate the overhead now:
+ * dest->activeconns*256 + dest->inactconns
+ */
+ return (atomic_read(&dest->activeconns) << 8) +
+ atomic_read(&dest->inactconns);
+}
+
+
+/*
+ * Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
+
+ /*
+ * Simply select the server with the least number of
+ * (activeconns<<5) + inactconns
+ * Except whose weight is equal to zero.
+ * If the weight is equal to zero, it means that the server is
+ * quiesced, the existing connections to the server still get
+ * served, but no new connection is assigned to the server.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry (e, struct ip_vs_dest, n_list);
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&least->weight) > 0) {
+ loh = ip_vs_lc_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->weight) == 0)
+ continue;
+ doh = ip_vs_lc_dest_overhead(dest);
+ if (doh < loh) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->inactconns));
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+ .name = "lc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_lc_init_svc,
+ .done_service = ip_vs_lc_done_svc,
+ .update_service = ip_vs_lc_update_svc,
+ .schedule = ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Never Queue scheduling module
+ *
+ * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimize its expected delay (The Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_nq_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_nq_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_nq_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&least->weight) > 0) {
+ loh = ip_vs_nq_dest_overhead(least);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&least->activeconns) == 0)
+ goto out;
+
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_nq_dest_overhead(dest);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&dest->activeconns) == 0) {
+ least = dest;
+ goto out;
+ }
+
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ out:
+ IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+ .name = "nq",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_nq_init_svc,
+ .done_service = ip_vs_nq_done_svc,
+ .update_service = ip_vs_nq_update_svc,
+ .schedule = ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ * register an ipvs protocol
+ */
+int register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp->next = ip_vs_proto_table[hash];
+ ip_vs_proto_table[hash] = pp;
+
+ if (pp->init != NULL)
+ pp->init(pp);
+
+ return 0;
+}
+
+
+/*
+ * unregister an ipvs protocol
+ */
+int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ struct ip_vs_protocol **pp_p;
+ unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp_p = &ip_vs_proto_table[hash];
+ for (; *pp_p; pp_p = &(*pp_p)->next) {
+ if (*pp_p == pp) {
+ *pp_p = pp->next;
+ if (pp->exit != NULL)
+ pp->exit(pp);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+
+/*
+ * get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+ struct ip_vs_protocol *pp;
+ unsigned hash = IP_VS_PROTO_HASH(proto);
+
+ for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+ if (pp->protocol == proto)
+ return pp;
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(int flags)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
+ if (pp->timeout_change)
+ pp->timeout_change(pp, flags);
+ }
+ }
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+ int *t;
+
+ t = kmalloc(size, GFP_ATOMIC);
+ if (t == NULL)
+ return NULL;
+ memcpy(t, table, size);
+ return t;
+}
+
+
+/*
+ * Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
+{
+ int i;
+
+ if (!table || !name || !to)
+ return -EINVAL;
+
+ for (i = 0; i < num; i++) {
+ if (strcmp(names[i], name))
+ continue;
+ table[i] = to * HZ;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ if (pp == NULL || pp->state_name == NULL)
+ return "ERR!";
+ return pp->state_name(state);
+}
+
+
+void
+tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
+{
+ char buf[128];
+ union ip_vs_tphdr h;
+
+ h.raw = (char *) iph + iph->ihl * 4;
+ if (iph->frag_off & __constant_htons(IP_OFFSET))
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+ pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ else
+ sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ ntohs(h.portp[0]),
+ NIPQUAD(iph->daddr),
+ ntohs(h.portp[1]));
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+int ip_vs_protocol_init(void)
+{
+ char protocols[64];
+#define REGISTER_PROTOCOL(p) \
+ do { \
+ register_ip_vs_protocol(p); \
+ strcat(protocols, ", "); \
+ strcat(protocols, (p)->name); \
+ } while (0)
+
+ protocols[0] = '\0';
+ protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ICMP
+ REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+ IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
+
+ return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ /* unregister all the ipvs protocols */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pp = ip_vs_proto_table[i]) != NULL)
+ unregister_ip_vs_protocol(pp);
+ }
+}
--- /dev/null
+/*
+ * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
+
+static struct ip_vs_conn *
+ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so the caller should check skip_nonexisting
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse?"ICMP+":"",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so the caller should check skip_nonexisting
+ * or our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ /*
+ * AH is only related traffic. Pass the packet to IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+
+static void
+ah_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
+{
+ char buf[256];
+
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+ pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void ah_init(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+static void ah_exit(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_ah = {
+ .name = "AH",
+ .protocol = IPPROTO_AH,
+ .minhlen = 0,
+ .minhlen_icmp = 0,
+ .dont_defrag = 1,
+ .skip_nonexisting = 1,
+ .slave = 1,
+ .init = ah_init,
+ .exit = ah_exit,
+ .conn_schedule = ah_conn_schedule,
+ .conn_in_get = ah_conn_in_get,
+ .conn_out_get = ah_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .csum_check = NULL,
+ .debug_packet = ah_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+ .set_state_timeout = NULL,
+};
--- /dev/null
+/*
+ * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
+
+static struct ip_vs_conn *
+esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_in_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so the caller should check skip_nonexisting
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse ? "ICMP+" : "",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP));
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr,
+ __constant_htons(PORT_ISAKMP),
+ iph->saddr,
+ __constant_htons(PORT_ISAKMP));
+ }
+
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so the caller should check skip_nonexisting
+ * or our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+ inverse?"ICMP+":"",
+ pp->name,
+ NIPQUAD(iph->saddr),
+ NIPQUAD(iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ /*
+ * ESP is only related traffic. Pass the packet to IP stack.
+ */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+
+static void
+esp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
+{
+ char buf[256];
+
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+ pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void esp_init(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+static void esp_exit(struct ip_vs_protocol *pp)
+{
+ /* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_esp = {
+ .name = "ESP",
+ .protocol = IPPROTO_ESP,
+ .minhlen = 0,
+ .minhlen_icmp = 0,
+ .dont_defrag = 1,
+ .skip_nonexisting = 1,
+ .slave = 1,
+ .init = esp_init,
+ .exit = esp_exit,
+ .conn_schedule = esp_conn_schedule,
+ .conn_in_get = esp_conn_in_get,
+ .conn_out_get = esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = esp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
--- /dev/null
+/*
+ * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, March 2002
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static int icmp_timeouts[1] = { 1*60*HZ };
+
+static char * icmp_state_name_table[1] = { "ICMP" };
+
+struct ip_vs_conn *
+icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+#if 0
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, 0,
+ iph->daddr, 0);
+ } else {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, 0,
+ iph->saddr, 0);
+ }
+
+ return cp;
+
+#else
+ return NULL;
+#endif
+}
+
+struct ip_vs_conn *
+icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+#if 0
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, 0,
+ iph->daddr, 0);
+ } else {
+ cp = ip_vs_conn_out_get(IPPROTO_UDP,
+ iph->daddr, 0,
+ iph->saddr, 0);
+ }
+
+ return cp;
+#else
+ return NULL;
+#endif
+}
+
+static int
+icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+static int
+icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ if (!(iph->frag_off & __constant_htons(IP_OFFSET))) {
+ if (ip_compute_csum(h.raw, size)) {
+ IP_VS_DBG_RL_PKT(0, pp, iph, "Failed checksum for");
+ return 0;
+ }
+ }
+ return 1;
+
+}
+
+static void
+icmp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
+{
+ char buf[256];
+ union ip_vs_tphdr h;
+
+ h.raw = (char *) iph + iph->ihl * 4;
+ if (iph->frag_off & __constant_htons(IP_OFFSET))
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+ pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+ else
+ sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
+ pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr),
+ h.icmph->type, h.icmph->code);
+
+ printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+static int
+icmp_state_transition(struct ip_vs_conn *cp,
+ int direction, struct iphdr *iph,
+ union ip_vs_tphdr h, struct ip_vs_protocol *pp)
+{
+ cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
+ return 1;
+}
+
+static int
+icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ int num;
+ char **names;
+
+ num = IP_VS_ICMP_S_LAST;
+ names = icmp_state_name_table;
+ return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
+}
+
+
+static void icmp_init(struct ip_vs_protocol *pp)
+{
+ pp->timeout_table = icmp_timeouts;
+}
+
+static void icmp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+struct ip_vs_protocol ip_vs_protocol_icmp = {
+ .name = "ICMP",
+ .protocol = IPPROTO_ICMP,
+ .minhlen = sizeof(struct icmphdr),
+ .minhlen_icmp = 8,
+ .dont_defrag = 0,
+ .skip_nonexisting = 0,
+ .slave = 0,
+ .init = icmp_init,
+ .exit = icmp_exit,
+ .conn_schedule = icmp_conn_schedule,
+ .conn_in_get = icmp_conn_in_get,
+ .conn_out_get = icmp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = icmp_csum_check,
+ .state_transition = icmp_state_transition,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = icmp_debug_packet,
+ .timeout_change = NULL,
+ .set_state_timeout = icmp_set_state_timeout,
+};
--- /dev/null
+/*
+ * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/compiler.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <linux/netfilter.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+tcp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ if (likely(!inverse)) {
+ return ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, h.th->source,
+ iph->daddr, h.th->dest);
+ } else {
+ return ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, h.th->dest,
+ iph->saddr, h.th->source);
+ }
+}
+
+static struct ip_vs_conn *
+tcp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ if (likely(!inverse)) {
+ return ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, h.th->source,
+ iph->daddr, h.th->dest);
+ } else {
+ return ip_vs_conn_out_get(iph->protocol,
+ iph->daddr, h.th->dest,
+ iph->saddr, h.th->source);
+ }
+}
+
+
+static int
+tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ struct ip_vs_service *svc;
+
+ if (h.th->syn &&
+ (svc = ip_vs_service_get(skb->nfmark, iph->protocol,
+ iph->daddr, h.portp[1]))) {
+ if (ip_vs_todrop()) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, iph);
+ if (!*cpp) {
+ *verdict = ip_vs_leave(svc, skb, pp, h);
+ return 0;
+ }
+ ip_vs_service_put(svc);
+ }
+ return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
+ u16 oldport, u16 newport)
+{
+ h->th->check =
+ ip_vs_check_diff(~oldip, newip,
+ ip_vs_check_diff(oldport ^ 0xFFFF,
+ newport, h->th->check));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ int ihl = (char *) h.raw - (char *) iph;
+
+ /* We are sure that we work on first fragment */
+
+ h.th->source = cp->vport;
+
+ /* Call application helper if needed */
+ if (ip_vs_app_pkt_out(cp, skb) != 0) {
+ /* skb data has probably changed, update pointers */
+ iph = skb->nh.iph;
+ h.raw = (char*)iph + ihl;
+ size = skb->len - ihl;
+ }
+
+ /* Adjust TCP checksums */
+ if (!cp->app) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(&h, cp->daddr, cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_HW)
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ h.th->check = 0;
+ skb->csum = csum_partial(h.raw, size, 0);
+ h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ skb->csum);
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
+ pp->name, h.th->check,
+ (char*)&(h.th->check) - (char*)h.raw);
+ }
+ return 1;
+}
+
+
+static int
+tcp_dnat_handler(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ int ihl = (char *) h.raw - (char *) iph;
+
+ /* We are sure that we work on first fragment */
+
+ h.th->dest = cp->dport;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (ip_vs_app_pkt_in(cp, skb) != 0) {
+ /* skb data has probably changed, update pointers */
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ size = skb->len - ihl;
+ }
+
+ /*
+ * Adjust TCP/UDP checksums
+ */
+ if (!cp->app) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(&h, cp->vaddr, cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_HW)
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ h.th->check = 0;
+ h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw, size, 0));
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = csum_partial(h.raw, size, 0);
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
+ iph->protocol, skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, pp, iph,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ break;
+ }
+
+ return 1;
+}
+
+
+#define TCP_DIR_INPUT 0
+#define TCP_DIR_OUTPUT 4
+#define TCP_DIR_INPUT_ONLY 8
+
+static int tcp_state_off[IP_VS_DIR_LAST] = {
+ [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
+ [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
+ [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ * Timeout table[state]
+ */
+static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 120*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+
+#if 0
+
+/* FIXME: This is going to die */
+
+static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 10*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 100*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+#endif
+
+static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = "NONE",
+ [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
+ [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
+ [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
+ [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
+ [IP_VS_TCP_S_CLOSE] = "CLOSE",
+ [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
+ [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
+ [IP_VS_TCP_S_LISTEN] = "LISTEN",
+ [IP_VS_TCP_S_SYNACK] = "SYNACK",
+ [IP_VS_TCP_S_LAST] = "BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+ int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+ if (state >= IP_VS_TCP_S_LAST)
+ return "ERR!";
+ return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+static struct tcp_states_t tcp_states [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t *tcp_state_table = tcp_states;
+
+
+static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+{
+ int on = (flags & 1); /* secure_tcp */
+
+ /*
+ ** FIXME: change secure_tcp to independent sysctl var
+ ** or make it per-service or per-app because it is valid
+ ** for most if not for all of the applications. Something
+ ** like "capabilities" (flags) for each object.
+ */
+ tcp_state_table = (on? tcp_states_dos : tcp_states);
+}
+
+static int
+tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
+ tcp_state_name_table, sname, to);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+ if (th->rst)
+ return 3;
+ if (th->syn)
+ return 0;
+ if (th->fin)
+ return 1;
+ if (th->ack)
+ return 2;
+ return -1;
+}
+
+static inline void
+set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ int direction, union ip_vs_tphdr h)
+{
+ int state_idx;
+ struct tcphdr *th = h.th;
+ int new_state = IP_VS_TCP_S_CLOSE;
+ int state_off = tcp_state_off[direction];
+
+ /*
+ * Update state offset to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (state_off == TCP_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ state_off = TCP_DIR_INPUT_ONLY;
+ }
+
+ if ((state_idx = tcp_state_idx(th)) < 0) {
+ IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+ tcp_state_out:
+ if (new_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
+ "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+ pp->name,
+ (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
+ th->syn? 'S' : '.',
+ th->fin? 'F' : '.',
+ th->ack? 'A' : '.',
+ th->rst? 'R' : '.',
+ NIPQUAD(cp->daddr), ntohs(cp->dport),
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ tcp_state_name(cp->state),
+ tcp_state_name(new_state),
+ atomic_read(&cp->refcnt));
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+
+ cp->timeout = pp->timeout_table[cp->state = new_state];
+}
+
+
+/*
+ * Handle state transitions
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp,
+ int direction, struct iphdr *iph,
+ union ip_vs_tphdr h, struct ip_vs_protocol *pp)
+{
+ spin_lock(&cp->lock);
+ set_tcp_state(pp, cp, direction, h);
+ spin_unlock(&cp->lock);
+
+ return 1;
+}
+
+
+/*
+ * Hash table for TCP application incarnations
+ */
+#define TCP_APP_TAB_BITS 4
+#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
+#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
+
+static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
+static spinlock_t tcp_app_lock = SPIN_LOCK_UNLOCKED;
+
+static inline __u16 tcp_app_hashkey(__u16 port)
+{
+ return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ struct list_head *t, *p;
+ __u16 hash, port = inc->port;
+ int ret = 0;
+
+ hash = tcp_app_hashkey(port);
+ t = &tcp_apps[hash];
+
+ spin_lock_bh(&tcp_app_lock);
+ for (p = t->next; p != t; p = p->next) {
+ i = list_entry(p, struct ip_vs_app, p_list);
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add(&inc->p_list, t);
+ atomic_inc(&ip_vs_protocol_tcp.appcnt);
+
+ out:
+ spin_unlock_bh(&tcp_app_lock);
+ return ret;
+}
+
+
+static void
+tcp_unregister_app(struct ip_vs_app *inc)
+{
+ spin_lock_bh(&tcp_app_lock);
+ atomic_dec(&ip_vs_protocol_tcp.appcnt);
+ list_del(&inc->p_list);
+ spin_unlock_bh(&tcp_app_lock);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct list_head *t, *p;
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = tcp_app_hashkey(cp->vport);
+ t = &tcp_apps[hash];
+
+ spin_lock(&tcp_app_lock);
+ for (p = t->next; p != t; p = p->next) {
+ inc = list_entry(p, struct ip_vs_app, p_list);
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ spin_unlock(&tcp_app_lock);
+
+ IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+ "%u.%u.%u.%u:%u to app %s on port %u\n",
+ __FUNCTION__,
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ spin_unlock(&tcp_app_lock);
+
+ out:
+ return result;
+}
+
+
+/*
+ * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+{
+ spin_lock(&cp->lock);
+ cp->state = IP_VS_TCP_S_LISTEN;
+ cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+ spin_unlock(&cp->lock);
+}
+
+
+static void tcp_init(struct ip_vs_protocol *pp)
+{
+ IP_VS_INIT_HASH_TABLE(tcp_apps);
+ pp->timeout_table = tcp_timeouts;
+}
+
+
+static void tcp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+extern void
+tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+ .name = "TCP",
+ .protocol = IPPROTO_TCP,
+ .minhlen = sizeof(struct tcphdr),
+ .minhlen_icmp = 8,
+ .dont_defrag = 0,
+ .skip_nonexisting = 0,
+ .slave = 0,
+ .appcnt = ATOMIC_INIT(0),
+ .init = tcp_init,
+ .exit = tcp_exit,
+ .register_app = tcp_register_app,
+ .unregister_app = tcp_unregister_app,
+ .conn_schedule = tcp_conn_schedule,
+ .conn_in_get = tcp_conn_in_get,
+ .conn_out_get = tcp_conn_out_get,
+ .snat_handler = tcp_snat_handler,
+ .dnat_handler = tcp_dnat_handler,
+ .csum_check = tcp_csum_check,
+ .state_name = tcp_state_name,
+ .state_transition = tcp_state_transition,
+ .app_conn_bind = tcp_app_conn_bind,
+ .debug_packet = tcpudp_debug_packet,
+ .timeout_change = tcp_timeout_change,
+ .set_state_timeout = tcp_set_state_timeout,
+};
--- /dev/null
+/*
+ * ip_vs_proto_udp.c: UDP load balancing support for IPVS
+ *
+ * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->saddr, h.portp[0],
+ iph->daddr, h.portp[1]);
+ } else {
+ cp = ip_vs_conn_in_get(iph->protocol,
+ iph->daddr, h.portp[1],
+ iph->saddr, h.portp[0]);
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int inverse)
+{
+ struct ip_vs_conn *cp;
+
+ if (likely(!inverse)) {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->saddr, h.portp[0],
+ iph->daddr, h.portp[1]);
+ } else {
+ cp = ip_vs_conn_out_get(iph->protocol,
+ iph->daddr, h.portp[1],
+ iph->saddr, h.portp[0]);
+ }
+
+ return cp;
+}
+
+
+static int
+udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h,
+ int *verdict, struct ip_vs_conn **cpp)
+{
+ struct ip_vs_service *svc;
+
+ if ((svc = ip_vs_service_get(skb->nfmark, iph->protocol,
+ iph->daddr, h.portp[1]))) {
+ if (ip_vs_todrop()) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ ip_vs_service_put(svc);
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, iph);
+ if (!*cpp) {
+ *verdict = ip_vs_leave(svc, skb, pp, h);
+ return 0;
+ }
+ ip_vs_service_put(svc);
+ }
+ return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
+ u16 oldport, u16 newport)
+{
+ h->uh->check =
+ ip_vs_check_diff(~oldip, newip,
+ ip_vs_check_diff(oldport ^ 0xFFFF,
+ newport, h->uh->check));
+ if (!h->uh->check)
+ h->uh->check = 0xFFFF;
+}
+
+static int
+udp_snat_handler(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ int ihl = (char *) h.raw - (char *) iph;
+
+ /* We are sure that we work on first fragment */
+
+ h.portp[0] = cp->vport;
+
+ /*
+ * Call application helper if needed
+ */
+ if (ip_vs_app_pkt_out(cp, skb) != 0) {
+ /* skb data has probably changed, update pointers */
+ iph = skb->nh.iph;
+ h.raw = (char*)iph + ihl;
+ size = skb->len - ihl;
+ }
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (!cp->app && (h.uh->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(&h, cp->daddr, cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_HW)
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ h.uh->check = 0;
+ skb->csum = csum_partial(h.raw, size, 0);
+ h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ skb->csum);
+ if (h.uh->check == 0)
+ h.uh->check = 0xFFFF;
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
+ pp->name, h.uh->check,
+ (char*)&(h.uh->check) - (char*)h.raw);
+ }
+ return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff *skb,
+ struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ int ihl = (char *) h.raw - (char *) iph;
+
+ /* We are sure that we work on first fragment */
+
+ h.portp[1] = cp->dport;
+
+ /*
+ * Attempt ip_vs_app call.
+ * will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (ip_vs_app_pkt_in(cp, skb) != 0) {
+ /* skb data has probably changed, update pointers */
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ size = skb->len - ihl;
+ }
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (!cp->app && (h.uh->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(&h, cp->vaddr, cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_HW)
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ h.uh->check = 0;
+ h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+ size, iph->protocol,
+ csum_partial(h.raw, size, 0));
+ if (h.uh->check == 0)
+ h.uh->check = 0xFFFF;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct iphdr *iph, union ip_vs_tphdr h, int size)
+{
+ if (h.uh->check != 0) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = csum_partial(h.raw, size, 0);
+ case CHECKSUM_HW:
+ if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
+ iph->protocol, skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, pp, iph,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* CHECKSUM_UNNECESSARY */
+ break;
+ }
+ }
+ return 1;
+}
+
+
+/*
+ * Note: the caller guarantees that only one of register_app,
+ * unregister_app or app_conn_bind is called each time.
+ */
+
+#define UDP_APP_TAB_BITS 4
+#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
+#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
+
+static struct list_head udp_apps[UDP_APP_TAB_SIZE];
+static spinlock_t udp_app_lock = SPIN_LOCK_UNLOCKED;
+
+static inline __u16 udp_app_hashkey(__u16 port)
+{
+ return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ struct list_head *t, *p;
+ __u16 hash, port = inc->port;
+ int ret = 0;
+
+ hash = udp_app_hashkey(port);
+ t = &udp_apps[hash];
+
+ spin_lock_bh(&udp_app_lock);
+ for (p = t->next; p != t; p = p->next) {
+ i = list_entry(p, struct ip_vs_app, p_list);
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add(&inc->p_list, t);
+ atomic_inc(&ip_vs_protocol_udp.appcnt);
+
+ out:
+ spin_unlock_bh(&udp_app_lock);
+ return ret;
+}
+
+
+static void
+udp_unregister_app(struct ip_vs_app *inc)
+{
+ spin_lock_bh(&udp_app_lock);
+ atomic_dec(&ip_vs_protocol_udp.appcnt);
+ list_del(&inc->p_list);
+ spin_unlock_bh(&udp_app_lock);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct list_head *t, *p;
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = udp_app_hashkey(cp->vport);
+ t = &udp_apps[hash];
+
+ spin_lock(&udp_app_lock);
+ for (p = t->next; p != t; p = p->next) {
+ inc = list_entry(p, struct ip_vs_app, p_list);
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ spin_unlock(&udp_app_lock);
+
+ IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+ "%u.%u.%u.%u:%u to app %s on port %u\n",
+ __FUNCTION__,
+ NIPQUAD(cp->caddr), ntohs(cp->cport),
+ NIPQUAD(cp->vaddr), ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ spin_unlock(&udp_app_lock);
+
+ out:
+ return result;
+}
+
+
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
+ [IP_VS_UDP_S_LAST] = 2*HZ,
+};
+
+static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!",
+};
+
+
+static int
+udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+ return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
+ udp_state_name_table, sname, to);
+}
+
+static const char * udp_state_name(int state)
+{
+ if (state >= IP_VS_UDP_S_LAST)
+ return "ERR!";
+ return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static int
+udp_state_transition(struct ip_vs_conn *cp,
+ int direction, struct iphdr *iph,
+ union ip_vs_tphdr h, struct ip_vs_protocol *pp)
+{
+ cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+ return 1;
+}
+
+static void udp_init(struct ip_vs_protocol *pp)
+{
+ IP_VS_INIT_HASH_TABLE(udp_apps);
+ pp->timeout_table = udp_timeouts;
+}
+
+static void udp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+extern void
+tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+ .name = "UDP",
+ .protocol = IPPROTO_UDP,
+ .minhlen = sizeof(struct udphdr),
+ .minhlen_icmp = 8,
+ .dont_defrag = 0,
+ .skip_nonexisting = 0,
+ .slave = 0,
+ .init = udp_init,
+ .exit = udp_exit,
+ .conn_schedule = udp_conn_schedule,
+ .conn_in_get = udp_conn_in_get,
+ .conn_out_get = udp_conn_out_get,
+ .snat_handler = udp_snat_handler,
+ .dnat_handler = udp_dnat_handler,
+ .csum_check = udp_csum_check,
+ .state_transition = udp_state_transition,
+ .state_name = udp_state_name,
+ .register_app = udp_register_app,
+ .unregister_app = udp_unregister_app,
+ .app_conn_bind = udp_app_conn_bind,
+ .debug_packet = tcpudp_debug_packet,
+ .timeout_change = NULL,
+ .set_state_timeout = udp_set_state_timeout,
+};
--- /dev/null
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *p, *q;
+ struct ip_vs_dest *dest;
+
+ IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
+
+ write_lock(&svc->sched_lock);
+ p = (struct list_head *)svc->sched_data;
+ p = p->next;
+ q = p;
+ do {
+ /* skip list head */
+ if (q == &svc->destinations) {
+ q = q->next;
+ continue;
+ }
+
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ q = q->next;
+ } while (q != p);
+ write_unlock(&svc->sched_lock);
+ return NULL;
+
+ out:
+ svc->sched_data = q;
+ write_unlock(&svc->sched_lock);
+ IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+ .name = "rr", /* name */
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_rr_init_svc,
+ .done_service = ip_vs_rr_done_svc,
+ .update_service = ip_vs_rr_update_svc,
+ .schedule = ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock for service table */
+static rwlock_t __ip_vs_sched_lock = RW_LOCK_UNLOCKED;
+
+
+/*
+ * Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *scheduler)
+{
+ int ret;
+
+ if (svc == NULL) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
+ return -EINVAL;
+ }
+ if (scheduler == NULL) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
+ return -EINVAL;
+ }
+
+ svc->scheduler = scheduler;
+
+ if (scheduler->init_service) {
+ ret = scheduler->init_service(svc);
+ if (ret) {
+ IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+
+/*
+ * Unbind a service with its scheduler
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (svc == NULL) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
+ return -EINVAL;
+ }
+
+ sched = svc->scheduler;
+ if (sched == NULL) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
+ return -EINVAL;
+ }
+
+ if (sched->done_service) {
+ if (sched->done_service(svc) != 0) {
+ IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
+ return -EINVAL;
+ }
+ }
+
+ svc->scheduler = NULL;
+ return 0;
+}
+
+
+/*
+ * Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+ struct list_head *l, *e;
+
+ IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
+ sched_name);
+
+ l = &ip_vs_schedulers;
+
+ read_lock_bh(&__ip_vs_sched_lock);
+
+ for (e=l->next; e!=l; e=e->next) {
+ sched = list_entry(e, struct ip_vs_scheduler, n_list);
+
+ /*
+ * Test and get the modules atomically
+ */
+ if (sched->module && !try_module_get(sched->module)) {
+ /*
+ * This scheduler is just deleted
+ */
+ continue;
+ }
+ if (strcmp(sched_name, sched->name)==0) {
+ /* HIT */
+ read_unlock_bh(&__ip_vs_sched_lock);
+ return sched;
+ }
+ if (sched->module)
+ module_put(sched->module);
+ }
+
+ read_unlock_bh(&__ip_vs_sched_lock);
+ return NULL;
+}
+
+
+/*
+ * Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * Search for the scheduler by sched_name
+ */
+ sched = ip_vs_sched_getbyname(sched_name);
+
+ /*
+ * If scheduler not found, load the module and search again
+ */
+ if (sched == NULL) {
+ char module_name[IP_VS_SCHEDNAME_MAXLEN+8];
+ sprintf(module_name,"ip_vs_%s", sched_name);
+ request_module(module_name);
+ sched = ip_vs_sched_getbyname(sched_name);
+ }
+
+ return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+ if (scheduler->module)
+ module_put(scheduler->module);
+}
+
+
+/*
+ * Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (!scheduler) {
+ IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
+ return -EINVAL;
+ }
+
+ if (!scheduler->name) {
+ IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
+ return -EINVAL;
+ }
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /*
+ * Make sure that the scheduler with this name doesn't exist
+ * in the scheduler list.
+ */
+ sched = ip_vs_sched_getbyname(scheduler->name);
+ if (sched) {
+ ip_vs_scheduler_put(sched);
+ ip_vs_use_count_dec();
+ IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+ "already existed in the system\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ write_lock_bh(&__ip_vs_sched_lock);
+
+ if (scheduler->n_list.next != &scheduler->n_list) {
+ write_unlock_bh(&__ip_vs_sched_lock);
+ ip_vs_use_count_dec();
+ IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+ "already linked\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Add it into the d-linked scheduler list
+ */
+ list_add(&scheduler->n_list, &ip_vs_schedulers);
+ write_unlock_bh(&__ip_vs_sched_lock);
+
+ IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
+
+ return 0;
+}
+
+
+/*
+ * Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ if (!scheduler) {
+ IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
+ return -EINVAL;
+ }
+
+ write_lock_bh(&__ip_vs_sched_lock);
+ if (scheduler->n_list.next == &scheduler->n_list) {
+ write_unlock_bh(&__ip_vs_sched_lock);
+ IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
+ "is not in the list. failed\n", scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Remove it from the d-linked scheduler list
+ */
+ list_del(&scheduler->n_list);
+ write_unlock_bh(&__ip_vs_sched_lock);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
+
+ return 0;
+}
--- /dev/null
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_sed_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_sed_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_sed_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&least->weight) > 0) {
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_sed_init_svc,
+ .done_service = ip_vs_sed_done_svc,
+ .update_service = ip_vs_sed_update_svc,
+ .schedule = ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ * Returns hash value for IPVS SH entry
+ */
+static inline unsigned ip_vs_sh_hashkey(__u32 addr)
+{
+ return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
+{
+ return (tbl[ip_vs_sh_hashkey(addr)]).dest;
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+
+ b = tbl;
+ p = &svc->destinations;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ if (list_empty(p)) {
+ b->dest = NULL;
+ } else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ atomic_inc(&dest->refcnt);
+ b->dest = dest;
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+
+ b = tbl;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ if (b->dest) {
+ atomic_dec(&b->dest->refcnt);
+ b->dest = NULL;
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl;
+
+ /* allocate the SH table for this service */
+ tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
+ GFP_ATOMIC);
+ if (tbl == NULL) {
+ IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "SH hash table (memory=%dbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(tbl);
+
+ /* release the table itself */
+ kfree(svc->sched_data);
+ IP_VS_DBG(6, "SH hash table (memory=%dbytes) released\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ return 0;
+}
+
+
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(tbl);
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_assign(tbl, svc);
+
+ return 0;
+}
+
+
+/*
+ * If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ * consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_sh_bucket *tbl;
+
+ IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+ tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
+ dest = ip_vs_sh_get(tbl, iph->saddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ return NULL;
+ }
+
+ IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
+ "--> server %u.%u.%u.%u:%d\n",
+ NIPQUAD(iph->saddr),
+ NIPQUAD(dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+ .name = "sh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_sh_init_svc,
+ .done_service = ip_vs_sh_done_svc,
+ .update_service = ip_vs_sh_update_svc,
+ .schedule = ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ */
+
+#define __KERNEL_SYSCALLS__ /* for waitpid */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/completion.h>
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <asm/uaccess.h> /* for get_fs and set_fs */
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+
+/*
+ * IPVS sync connection entry
+ */
+struct ip_vs_sync_conn {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __u16 cport;
+ __u16 vport;
+ __u16 dport;
+ __u32 caddr; /* client address */
+ __u32 vaddr; /* virtual address */
+ __u32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __u16 flags; /* status flags */
+ __u16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+ The master mulitcasts messages to the backup load balancers in the
+ following format.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ | . |
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+#define SYNC_MESG_HEADER_LEN 4
+
+struct ip_vs_sync_mesg {
+ __u8 nr_conns;
+ __u8 syncid;
+ __u16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
+
+/* the maximum length of sync (sending/receiving) message */
+static int sync_send_mesg_maxlen;
+static int sync_recv_mesg_maxlen;
+
+struct ip_vs_sync_buff {
+ struct list_head list;
+ unsigned long firstuse;
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg;
+ unsigned char *head;
+ unsigned char *end;
+};
+
+
+/* the sync_buff list head and the lock */
+static LIST_HEAD(ip_vs_sync_queue);
+static spinlock_t ip_vs_sync_lock = SPIN_LOCK_UNLOCKED;
+
+/* current sync_buff for accepting new conn entries */
+static struct ip_vs_sync_buff *curr_sb = NULL;
+static spinlock_t curr_sb_lock = SPIN_LOCK_UNLOCKED;
+
+/* ipvs sync daemon state */
+volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
+volatile int ip_vs_master_syncid = 0;
+volatile int ip_vs_backup_syncid = 0;
+
+/* multicast interface name */
+char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+/* multicast addr */
+static struct sockaddr_in mcast_addr;
+
+
+static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+{
+ spin_lock(&ip_vs_sync_lock);
+ list_add_tail(&sb->list, &ip_vs_sync_queue);
+ spin_unlock(&ip_vs_sync_lock);
+}
+
+static inline struct ip_vs_sync_buff * sb_dequeue(void)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ip_vs_sync_lock);
+ if (list_empty(&ip_vs_sync_queue)) {
+ sb = NULL;
+ } else {
+ sb = list_entry(ip_vs_sync_queue.next,
+ struct ip_vs_sync_buff,
+ list);
+ list_del(&sb->list);
+ }
+ spin_unlock_bh(&ip_vs_sync_lock);
+
+ return sb;
+}
+
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+{
+ struct ip_vs_sync_buff *sb;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+ kfree(sb);
+ return NULL;
+ }
+ sb->mesg->nr_conns = 0;
+ sb->mesg->syncid = ip_vs_master_syncid;
+ sb->mesg->size = 4;
+ sb->head = (unsigned char *)sb->mesg + 4;
+ sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+ kfree(sb->mesg);
+ kfree(sb);
+}
+
+/*
+ * Get the current sync buffer if it has been created for more
+ * than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(unsigned long time)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&curr_sb_lock);
+ if (curr_sb &&
+ (jiffies - curr_sb->firstuse > time || time == 0)) {
+ sb = curr_sb;
+ curr_sb = NULL;
+ } else
+ sb = NULL;
+ spin_unlock_bh(&curr_sb_lock);
+ return sb;
+}
+
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ */
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
+{
+ struct ip_vs_sync_mesg *m;
+ struct ip_vs_sync_conn *s;
+ int len;
+
+ spin_lock(&curr_sb_lock);
+ if (!curr_sb) {
+ if (!(curr_sb=ip_vs_sync_buff_create())) {
+ spin_unlock(&curr_sb_lock);
+ IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ }
+
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
+ m = curr_sb->mesg;
+ s = (struct ip_vs_sync_conn *)curr_sb->head;
+
+ /* copy members */
+ s->protocol = cp->protocol;
+ s->cport = cp->cport;
+ s->vport = cp->vport;
+ s->dport = cp->dport;
+ s->caddr = cp->caddr;
+ s->vaddr = cp->vaddr;
+ s->daddr = cp->daddr;
+ s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->state = htons(cp->state);
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ struct ip_vs_sync_conn_options *opt =
+ (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(opt, &cp->in_seq, sizeof(*opt));
+ }
+
+ m->nr_conns++;
+ m->size += len;
+ curr_sb->head += len;
+
+ /* check if there is a space for next one */
+ if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
+ sb_queue_tail(curr_sb);
+ curr_sb = NULL;
+ }
+ spin_unlock(&curr_sb_lock);
+
+ /* synchronize its controller if it has */
+ if (cp->control)
+ ip_vs_sync_conn(cp->control);
+}
+
+
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ */
+static void ip_vs_process_message(const char *buffer, const size_t buflen)
+{
+ struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+ struct ip_vs_sync_conn *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_conn *cp;
+ char *p;
+ int i;
+
+ if (buflen != m->size) {
+ IP_VS_ERR("bogus message\n");
+ return;
+ }
+
+ /* SyncID sanity check */
+ if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+ IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
+ m->syncid);
+ return;
+ }
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+ for (i=0; i<m->nr_conns; i++) {
+ s = (struct ip_vs_sync_conn *)p;
+ cp = ip_vs_conn_in_get(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport);
+ if (!cp) {
+ cp = ip_vs_conn_new(s->protocol,
+ s->caddr, s->cport,
+ s->vaddr, s->vport,
+ s->daddr, s->dport,
+ ntohs(s->flags), NULL);
+ if (!cp) {
+ IP_VS_ERR("ip_vs_conn_new failed\n");
+ return;
+ }
+ cp->state = ntohs(s->state);
+ } else if (!cp->dest) {
+ /* it is an entry created by the synchronization */
+ cp->state = ntohs(s->state);
+ cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
+ } /* Note that we don't touch its state and flags
+ if it is a normal entry. */
+
+ if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
+ opt = (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ p += FULL_CONN_SIZE;
+ } else
+ p += SIMPLE_CONN_SIZE;
+
+ atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+ cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
+ ip_vs_conn_put(cp);
+
+ if (p > buffer+buflen) {
+ IP_VS_ERR("bogus message\n");
+ return;
+ }
+ }
+}
+
+
+/*
+ * Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+ struct inet_opt *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+ lock_sock(sk);
+ inet->mc_loop = loop ? 1 : 0;
+ release_sock(sk);
+}
+
+/*
+ * Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+ struct inet_opt *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+ lock_sock(sk);
+ inet->mc_ttl = ttl;
+ release_sock(sk);
+}
+
+/*
+ * Specifiy default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+ struct net_device *dev;
+ struct inet_opt *inet = inet_sk(sk);
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet->mc_index = dev->ifindex;
+ /* inet->mc_addr = 0; */
+ release_sock(sk);
+
+ return 0;
+}
+
+
+/*
+ * Set the maximum length of sync message according to the
+ * specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(int sync_state)
+{
+ struct net_device *dev;
+ int num;
+
+ if (sync_state == IP_VS_STATE_MASTER) {
+ if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
+ return -ENODEV;
+
+ num = (dev->mtu - sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+ sync_send_mesg_maxlen =
+ SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
+ IP_VS_DBG(7, "setting the maximum length of sync sending "
+ "message %d.\n", sync_send_mesg_maxlen);
+ } else if (sync_state == IP_VS_STATE_BACKUP) {
+ if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
+ return -ENODEV;
+
+ sync_recv_mesg_maxlen = dev->mtu -
+ sizeof(struct iphdr) - sizeof(struct udphdr);
+ IP_VS_DBG(7, "setting the maximum length of sync receiving "
+ "message %d.\n", sync_recv_mesg_maxlen);
+ }
+
+ return 0;
+}
+
+
+/*
+ * Join a multicast group.
+ * the group is specified by a class D multicast address 224.0.0.0/8
+ * in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+ struct ip_mreqn mreq;
+ struct net_device *dev;
+ int ret;
+
+ memset(&mreq, 0, sizeof(mreq));
+ memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ mreq.imr_ifindex = dev->ifindex;
+
+ lock_sock(sk);
+ ret = ip_mc_join_group(sk, &mreq);
+ release_sock(sk);
+
+ return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+ struct net_device *dev;
+ u32 addr;
+ struct sockaddr_in sin;
+
+ if ((dev = __dev_get_by_name(ifname)) == NULL)
+ return -ENODEV;
+
+ addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ if (!addr)
+ IP_VS_ERR("You probably need to specify IP address on "
+ "multicast interface.\n");
+
+ IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
+ ifname, NIPQUAD(addr));
+
+ /* Now bind the socket with the address of multicast interface */
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+ sin.sin_port = 0;
+
+ return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ * Set up sending multicast socket over UDP
+ */
+static struct socket * make_send_sock(void)
+{
+ struct socket *sock;
+
+ /* First create a socket */
+ if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+ IP_VS_ERR("Error during creation of socket; terminating\n");
+ return NULL;
+ }
+
+ if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
+ IP_VS_ERR("Error setting outbound mcast interface\n");
+ goto error;
+ }
+
+ set_mcast_loop(sock->sk, 0);
+ set_mcast_ttl(sock->sk, 1);
+
+ if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
+ IP_VS_ERR("Error binding address of the mcast interface\n");
+ goto error;
+ }
+
+ if (sock->ops->connect(sock,
+ (struct sockaddr*)&mcast_addr,
+ sizeof(struct sockaddr), 0) < 0) {
+ IP_VS_ERR("Error connecting to the multicast addr\n");
+ goto error;
+ }
+
+ return sock;
+
+ error:
+ sock_release(sock);
+ return NULL;
+}
+
+
+/*
+ * Set up receiving multicast socket over UDP
+ */
+static struct socket * make_receive_sock(void)
+{
+ struct socket *sock;
+
+ /* First create a socket */
+ if (sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+ IP_VS_ERR("Error during creation of socket; terminating\n");
+ return NULL;
+ }
+
+ /* it is equivalent to the REUSEADDR option in user-space */
+ sock->sk->sk_reuse = 1;
+
+ if (sock->ops->bind(sock,
+ (struct sockaddr*)&mcast_addr,
+ sizeof(struct sockaddr)) < 0) {
+ IP_VS_ERR("Error binding to the multicast addr\n");
+ goto error;
+ }
+
+ /* join the multicast group */
+ if (join_mcast_group(sock->sk,
+ (struct in_addr*)&mcast_addr.sin_addr,
+ ip_vs_backup_mcast_ifn) < 0) {
+ IP_VS_ERR("Error joining to the multicast group\n");
+ goto error;
+ }
+
+ return sock;
+
+ error:
+ sock_release(sock);
+ return NULL;
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+ struct msghdr msg;
+ mm_segment_t oldfs;
+ struct iovec iov;
+ int len;
+
+ EnterFunction(7);
+ iov.iov_base = (void *)buffer;
+ iov.iov_len = length;
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_sendmsg(sock, &msg, (size_t)(length));
+ set_fs(oldfs);
+
+ LeaveFunction(7);
+ return len;
+}
+
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+ struct msghdr msg;
+ struct iovec iov;
+ int len;
+ mm_segment_t oldfs;
+
+ EnterFunction(7);
+
+ /* Receive a packet */
+ iov.iov_base = buffer;
+ iov.iov_len = (size_t)buflen;
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_recvmsg(sock, &msg, buflen, 0);
+ set_fs(oldfs);
+
+ if (len < 0)
+ return -1;
+
+ LeaveFunction(7);
+ return len;
+}
+
+
+static int errno;
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
+static pid_t sync_master_pid = 0;
+static pid_t sync_backup_pid = 0;
+
+static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
+static int stop_master_sync = 0;
+static int stop_backup_sync = 0;
+
+static void sync_master_loop(void)
+{
+ struct socket *sock;
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg *m;
+
+ /* create the sending multicast socket */
+ sock = make_send_sock();
+ if (!sock)
+ return;
+
+ IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
+ "syncid = %d\n",
+ ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+
+ for (;;) {
+ while ((sb=sb_dequeue())) {
+ m = sb->mesg;
+ if (ip_vs_send_async(sock, (char *)m,
+ m->size) != m->size)
+ IP_VS_ERR("ip_vs_send_async error\n");
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* check if entries stay in curr_sb for 2 seconds */
+ if ((sb = get_curr_sync_buff(2*HZ))) {
+ m = sb->mesg;
+ if (ip_vs_send_async(sock, (char *)m,
+ m->size) != m->size)
+ IP_VS_ERR("ip_vs_send_async error\n");
+ ip_vs_sync_buff_release(sb);
+ }
+
+ if (stop_master_sync)
+ break;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* clean up the sync_buff queue */
+ while ((sb=sb_dequeue())) {
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* clean up the current sync_buff */
+ if ((sb = get_curr_sync_buff(0))) {
+ ip_vs_sync_buff_release(sb);
+ }
+
+ /* release the sending multicast socket */
+ sock_release(sock);
+}
+
+
+static void sync_backup_loop(void)
+{
+ struct socket *sock;
+ char *buf;
+ int len;
+
+ if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
+ IP_VS_ERR("sync_backup_loop: kmalloc error\n");
+ return;
+ }
+
+ /* create the receiving multicast socket */
+ sock = make_receive_sock();
+ if (!sock)
+ goto out;
+
+ IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
+ "syncid = %d\n",
+ ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+
+ for (;;) {
+ /* do you have data now? */
+ while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
+ if ((len =
+ ip_vs_receive(sock, buf,
+ sync_recv_mesg_maxlen)) <= 0) {
+ IP_VS_ERR("receiving message error\n");
+ break;
+ }
+ /* disable bottom half, because it accessed the data
+ shared by softirq while getting/creating conns */
+ local_bh_disable();
+ ip_vs_process_message(buf, len);
+ local_bh_enable();
+ }
+
+ if (stop_backup_sync)
+ break;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ);
+ __set_current_state(TASK_RUNNING);
+ }
+
+ /* release the sending multicast socket */
+ sock_release(sock);
+
+ out:
+ kfree(buf);
+}
+
+
+static void set_sync_pid(int sync_state, pid_t sync_pid)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ sync_master_pid = sync_pid;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ sync_backup_pid = sync_pid;
+}
+
+static void set_stop_sync(int sync_state, int set)
+{
+ if (sync_state == IP_VS_STATE_MASTER)
+ stop_master_sync = set;
+ else if (sync_state == IP_VS_STATE_BACKUP)
+ stop_backup_sync = set;
+ else {
+ stop_master_sync = set;
+ stop_backup_sync = set;
+ }
+}
+
+static int sync_thread(void *startup)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ mm_segment_t oldmm;
+ int state;
+ const char *name;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
+ state = IP_VS_STATE_MASTER;
+ name = "ipvs syncmaster";
+ } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
+ state = IP_VS_STATE_BACKUP;
+ name = "ipvs syncbackup";
+ } else {
+ IP_VS_BUG();
+ ip_vs_use_count_dec();
+ return -EINVAL;
+ }
+
+ daemonize(name);
+
+ oldmm = get_fs();
+ set_fs(KERNEL_DS);
+
+ /* Block all signals */
+ spin_lock_irq(¤t->sighand->siglock);
+ siginitsetinv(¤t->blocked, 0);
+ recalc_sigpending();
+ spin_unlock_irq(¤t->sighand->siglock);
+
+ /* set the maximum length of sync message */
+ set_sync_mesg_maxlen(state);
+
+ /* set up multicast address */
+ mcast_addr.sin_family = AF_INET;
+ mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
+ mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
+
+ add_wait_queue(&sync_wait, &wait);
+
+ set_sync_pid(state, current->pid);
+ complete((struct completion *)startup);
+
+ /* processing master/backup loop here */
+ if (state == IP_VS_STATE_MASTER)
+ sync_master_loop();
+ else if (state == IP_VS_STATE_BACKUP)
+ sync_backup_loop();
+ else IP_VS_BUG();
+
+ remove_wait_queue(&sync_wait, &wait);
+
+ /* thread exits */
+ set_sync_pid(state, 0);
+ IP_VS_INFO("sync thread stopped!\n");
+
+ set_fs(oldmm);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ set_stop_sync(state, 0);
+ wake_up(&stop_sync_wait);
+
+ return 0;
+}
+
+
+static int fork_sync_thread(void *startup)
+{
+ /* fork the sync thread here, then the parent process of the
+ sync thread is the init process after this thread exits. */
+ if (kernel_thread(sync_thread, startup, 0) < 0)
+ IP_VS_BUG();
+ return 0;
+}
+
+
+int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+{
+ DECLARE_COMPLETION(startup);
+ pid_t pid;
+ int waitpid_result;
+
+ if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
+ (state == IP_VS_STATE_BACKUP && sync_backup_pid))
+ return -EEXIST;
+
+ IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+ IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %d bytes\n",
+ sizeof(struct ip_vs_sync_conn));
+
+ ip_vs_sync_state |= state;
+ if (state == IP_VS_STATE_MASTER) {
+ strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
+ ip_vs_master_syncid = syncid;
+ } else {
+ strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
+ ip_vs_backup_syncid = syncid;
+ }
+
+ if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0)
+ IP_VS_BUG();
+
+ if ((waitpid_result = waitpid(pid, NULL, __WCLONE)) != pid) {
+ IP_VS_ERR("%s: waitpid(%d,...) failed, errno %d\n",
+ __FUNCTION__, pid, -waitpid_result);
+ }
+
+ wait_for_completion(&startup);
+
+ return 0;
+}
+
+
+int stop_sync_thread(int state)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
+ (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
+ return -ESRCH;
+
+ IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+ IP_VS_INFO("stopping sync thread %d ...\n",
+ (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
+
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&stop_sync_wait, &wait);
+ set_stop_sync(state, 1);
+ ip_vs_sync_state -= state;
+ wake_up(&sync_wait);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&stop_sync_wait, &wait);
+
+ /* Note: no need to reap the sync thread, because its parent
+ process is the init process */
+
+ if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
+ (state == IP_VS_STATE_BACKUP && stop_backup_sync))
+ IP_VS_BUG();
+
+ return 0;
+}
--- /dev/null
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version: $Id: ip_vs_timer.c,v 1.11 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * The following block implements slow timers for IPVS, most code is stolen
+ * from linux/kernel/timer.c.
+ * Slow timer is used to avoid the overhead of cascading timers, when lots
+ * of connection entries (>50,000) are cluttered in the system.
+ */
+#define SHIFT_BITS 6
+#define TVN_BITS 8
+#define TVR_BITS 10
+#define TVN_SIZE (1 << TVN_BITS)
+#define TVR_SIZE (1 << TVR_BITS)
+#define TVN_MASK (TVN_SIZE - 1)
+#define TVR_MASK (TVR_SIZE - 1)
+
+struct sltimer_vec {
+ int index;
+ struct list_head vec[TVN_SIZE];
+};
+
+struct sltimer_vec_root {
+ int index;
+ struct list_head vec[TVR_SIZE];
+};
+
+static struct sltimer_vec sltv3 = { 0 };
+static struct sltimer_vec sltv2 = { 0 };
+static struct sltimer_vec_root sltv1 = { 0 };
+
+static struct sltimer_vec * const sltvecs[] = {
+ (struct sltimer_vec *)&sltv1, &sltv2, &sltv3
+};
+
+#define NOOF_SLTVECS (sizeof(sltvecs) / sizeof(sltvecs[0]))
+
+static void init_sltimervecs(void)
+{
+ int i;
+
+ for (i = 0; i < TVN_SIZE; i++) {
+ INIT_LIST_HEAD(sltv3.vec + i);
+ INIT_LIST_HEAD(sltv2.vec + i);
+ }
+ for (i = 0; i < TVR_SIZE; i++)
+ INIT_LIST_HEAD(sltv1.vec + i);
+}
+
+static unsigned long sltimer_jiffies = 0;
+
+static inline void internal_add_sltimer(struct sltimer_list *timer)
+{
+ /*
+ * must hold the sltimer lock when calling this
+ */
+ unsigned long expires = timer->expires;
+ unsigned long idx = expires - sltimer_jiffies;
+ struct list_head * vec;
+
+ if (idx < 1 << (SHIFT_BITS + TVR_BITS)) {
+ int i = (expires >> SHIFT_BITS) & TVR_MASK;
+ vec = sltv1.vec + i;
+ } else if (idx < 1 << (SHIFT_BITS + TVR_BITS + TVN_BITS)) {
+ int i = (expires >> (SHIFT_BITS+TVR_BITS)) & TVN_MASK;
+ vec = sltv2.vec + i;
+ } else if ((signed long) idx < 0) {
+ /*
+ * can happen if you add a timer with expires == jiffies,
+ * or you set a timer to go off in the past
+ */
+ vec = sltv1.vec + sltv1.index;
+ } else if (idx <= 0xffffffffUL) {
+ int i = (expires >> (SHIFT_BITS+TVR_BITS+TVN_BITS)) & TVN_MASK;
+ vec = sltv3.vec + i;
+ } else {
+ /* Can only get here on architectures with 64-bit jiffies */
+ INIT_LIST_HEAD(&timer->list);
+ }
+ /*
+ * Timers are FIFO!
+ */
+ list_add(&timer->list, vec->prev);
+}
+
+
+static spinlock_t __ip_vs_sltimerlist_lock = SPIN_LOCK_UNLOCKED;
+
+void add_sltimer(struct sltimer_list *timer)
+{
+ spin_lock(&__ip_vs_sltimerlist_lock);
+ if (timer->list.next)
+ goto bug;
+ internal_add_sltimer(timer);
+ out:
+ spin_unlock(&__ip_vs_sltimerlist_lock);
+ return;
+
+ bug:
+ printk("bug: kernel sltimer added twice at %p.\n",
+ __builtin_return_address(0));
+ goto out;
+}
+
+static inline int detach_sltimer(struct sltimer_list *timer)
+{
+ if (!sltimer_pending(timer))
+ return 0;
+ list_del(&timer->list);
+ return 1;
+}
+
+void mod_sltimer(struct sltimer_list *timer, unsigned long expires)
+{
+ int ret;
+
+ spin_lock(&__ip_vs_sltimerlist_lock);
+ timer->expires = expires;
+ ret = detach_sltimer(timer);
+ internal_add_sltimer(timer);
+ spin_unlock(&__ip_vs_sltimerlist_lock);
+}
+
+int del_sltimer(struct sltimer_list * timer)
+{
+ int ret;
+
+ spin_lock(&__ip_vs_sltimerlist_lock);
+ ret = detach_sltimer(timer);
+ timer->list.next = timer->list.prev = 0;
+ spin_unlock(&__ip_vs_sltimerlist_lock);
+ return ret;
+}
+
+
+static inline void cascade_sltimers(struct sltimer_vec *tv)
+{
+ /*
+ * cascade all the timers from tv up one level
+ */
+ struct list_head *head, *curr, *next;
+
+ head = tv->vec + tv->index;
+ curr = head->next;
+
+ /*
+ * We are removing _all_ timers from the list, so we don't have to
+ * detach them individually, just clear the list afterwards.
+ */
+ while (curr != head) {
+ struct sltimer_list *tmp;
+
+ tmp = list_entry(curr, struct sltimer_list, list);
+ next = curr->next;
+ list_del(curr); // not needed
+ internal_add_sltimer(tmp);
+ curr = next;
+ }
+ INIT_LIST_HEAD(head);
+ tv->index = (tv->index + 1) & TVN_MASK;
+}
+
+static inline void run_sltimer_list(void)
+{
+ spin_lock(&__ip_vs_sltimerlist_lock);
+ while ((long)(jiffies - sltimer_jiffies) >= 0) {
+ struct list_head *head, *curr;
+ if (!sltv1.index) {
+ int n = 1;
+ do {
+ cascade_sltimers(sltvecs[n]);
+ } while (sltvecs[n]->index == 1 && ++n < NOOF_SLTVECS);
+ }
+ repeat:
+ head = sltv1.vec + sltv1.index;
+ curr = head->next;
+ if (curr != head) {
+ struct sltimer_list *timer;
+ void (*fn)(unsigned long);
+ unsigned long data;
+
+ timer = list_entry(curr, struct sltimer_list, list);
+ fn = timer->function;
+ data= timer->data;
+
+ detach_sltimer(timer);
+ timer->list.next = timer->list.prev = NULL;
+ spin_unlock(&__ip_vs_sltimerlist_lock);
+ fn(data);
+ spin_lock(&__ip_vs_sltimerlist_lock);
+ goto repeat;
+ }
+ sltimer_jiffies += 1<<SHIFT_BITS;
+ sltv1.index = (sltv1.index + 1) & TVR_MASK;
+ }
+ spin_unlock(&__ip_vs_sltimerlist_lock);
+}
+
+static struct timer_list slow_timer;
+
+/*
+ * Slow timer handler is activated every second
+ */
+#define SLTIMER_PERIOD 1*HZ
+
+static void sltimer_handler(unsigned long data)
+{
+ run_sltimer_list();
+
+ update_defense_level();
+ if (atomic_read(&ip_vs_dropentry))
+ ip_vs_random_dropentry();
+
+ mod_timer(&slow_timer, (jiffies + SLTIMER_PERIOD));
+}
+
+
+void ip_vs_sltimer_init(void)
+{
+ /* initialize the slow timer vectors */
+ init_sltimervecs();
+
+ /* initialize the slow timer jiffies and the vector indexes */
+ sltimer_jiffies = jiffies;
+ sltv1.index = (sltimer_jiffies >> SHIFT_BITS) & TVR_MASK;
+ sltv2.index = (sltimer_jiffies >> (SHIFT_BITS + TVR_BITS)) & TVN_MASK;
+ sltv3.index = (sltimer_jiffies >> (SHIFT_BITS + TVR_BITS + TVN_BITS))
+ & TVN_MASK;
+
+ /* Hook the slow_timer handler in the system timer */
+ init_timer(&slow_timer);
+ slow_timer.function = sltimer_handler;
+ slow_timer.expires = jiffies+SLTIMER_PERIOD;
+ add_timer(&slow_timer);
+}
+
+
+void ip_vs_sltimer_cleanup(void)
+{
+ del_timer_sync(&slow_timer);
+}
--- /dev/null
+/*
+ * IPVS: Weighted Least-Connection Scheduling module
+ *
+ * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static int
+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
+{
+ return 0;
+}
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We think the overhead of processing active connections is 256
+ * times higher than that of inactive connections in average. (This
+ * 256 times might not be accurate, we will change it later) We
+ * use the following formula to estimate the overhead now:
+ * dest->activeconns*256 + dest->inactconns
+ */
+ return (atomic_read(&dest->activeconns) << 8) +
+ atomic_read(&dest->inactconns);
+}
+
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest, *least;
+ unsigned int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ least = list_entry(e, struct ip_vs_dest, n_list);
+ if (!(least->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&least->weight) > 0) {
+ loh = ip_vs_wlc_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_wlc_dest_overhead(dest);
+ if (loh * atomic_read(&dest->weight) >
+ doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ NIPQUAD(least->addr), ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+ .name = "wlc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_wlc_init_svc,
+ .done_service = ip_vs_wlc_done_svc,
+ .update_service = ip_vs_wlc_update_svc,
+ .schedule = ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ * Wensong Zhang : changed some comestics things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+ struct list_head *cl; /* current list head */
+ int cw; /* current weight */
+ int mw; /* maximum weight */
+ int di; /* decreasing interval */
+};
+
+
+/*
+ * Get the gcd of server weights
+ */
+static int gcd(int a, int b)
+{
+ int c;
+
+ while ((c = a % b)) {
+ a = b;
+ b = c;
+ }
+ return b;
+}
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 1;
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ g = weight;
+ break;
+ }
+ }
+ if (e == l)
+ return g;
+
+ for (e=e->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ weight = atomic_read(&dest->weight);
+ if (weight > 0)
+ g = gcd(weight, g);
+ }
+
+ return g;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest;
+ int weight = 0;
+
+ l = &svc->destinations;
+ for (e=l->next; e!=l; e=e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (atomic_read(&dest->weight) > weight)
+ weight = atomic_read(&dest->weight);
+ }
+
+ return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark;
+
+ /*
+ * Allocate the mark variable for WRR scheduling
+ */
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+ if (mark == NULL) {
+ IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+ return -ENOMEM;
+ }
+ mark->cl = &svc->destinations;
+ mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ svc->sched_data = mark;
+
+ return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+ /*
+ * Release the mark variable
+ */
+ kfree(svc->sched_data);
+
+ return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ mark->cl = &svc->destinations;
+ mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ return 0;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+ struct list_head *p;
+
+ IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+ /*
+ * This loop will always terminate, because mark->cw in (0, max_weight]
+ * and at least one server has its weight equal to max_weight.
+ */
+ write_lock(&svc->sched_lock);
+ p = mark->cl;
+ while (1) {
+ if (mark->cl == &svc->destinations) {
+ /* it is at the head of the destination list */
+
+ if (mark->cl == mark->cl->next) {
+ /* no dest entry */
+ dest = NULL;
+ goto out;
+ }
+
+ mark->cl = svc->destinations.next;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /*
+ * Still zero, which means no availabe servers.
+ */
+ if (mark->cw == 0) {
+ mark->cl = &svc->destinations;
+ IP_VS_INFO("ip_vs_wrr_schedule(): "
+ "no available servers\n");
+ dest = NULL;
+ goto out;
+ }
+ }
+ } else
+ mark->cl = mark->cl->next;
+
+ if (mark->cl != &svc->destinations) {
+ /* not at the head of the list */
+ dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) >= mark->cw) {
+ /* got it */
+ break;
+ }
+ }
+
+ if (mark->cl == p) {
+ /* back to the start, and no dest is found.
+ It is only possible when all dests are OVERLOADED */
+ dest = NULL;
+ goto out;
+ }
+ }
+
+ IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt),
+ atomic_read(&dest->weight));
+
+ out:
+ write_unlock(&svc->sched_lock);
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+ .name = "wrr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .init_service = ip_vs_wrr_init_svc,
+ .done_service = ip_vs_wrr_done_svc,
+ .update_service = ip_vs_wrr_update_svc,
+ .schedule = ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+ INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+{
+ struct dst_entry *old_dst;
+
+ old_dst = dest->dst_cache;
+ dest->dst_cache = dst;
+ dest->dst_rtos = rtos;
+ dst_release(old_dst);
+}
+
+static inline struct dst_entry *
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+{
+ struct dst_entry *dst = dest->dst_cache;
+
+ if (!dst)
+ return NULL;
+ if ((dst->obsolete || rtos != dest->dst_rtos) &&
+ dst->ops->check(dst, cookie) == NULL) {
+ dest->dst_cache = 0;
+ return NULL;
+ }
+ dst_hold(dst);
+ return dst;
+}
+
+static inline struct rtable *
+__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (dest) {
+ spin_lock(&dest->dst_lock);
+ if (!(rt = (struct rtable *)
+ __ip_vs_dst_check(dest, rtos, 0))) {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dest->addr,
+ .saddr = 0,
+ .tos = rtos, } },
+ .proto = cp->protocol,
+ };
+
+ if (ip_route_output_key(&rt, &fl)) {
+ spin_unlock(&dest->dst_lock);
+ IP_VS_DBG_RL("ip_route_output error, "
+ "dest: %u.%u.%u.%u\n",
+ NIPQUAD(dest->addr));
+ return NULL;
+ }
+ __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+ IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
+ NIPQUAD(dest->addr),
+ atomic_read(&rt->u.dst.__refcnt), rtos);
+ }
+ spin_unlock(&dest->dst_lock);
+ } else {
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = dest->addr,
+ .saddr = 0,
+ .tos = rtos, } },
+ .proto = cp->protocol,
+ };
+
+ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_route_output error, dest: "
+ "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
+ return NULL;
+ }
+ }
+
+ return rt;
+}
+
+
+/*
+ * Release dest->dst_cache before a dest is removed
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+ struct dst_entry *old_dst;
+
+ old_dst = dest->dst_cache;
+ dest->dst_cache = NULL;
+ dst_release(old_dst);
+}
+
+
+static inline int
+ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom,
+ struct iphdr **iph_p, unsigned char **t_p)
+{
+ int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
+
+ if (delta < 0)
+ delta = 0;
+
+ if (delta ||skb_cloned(skb)) {
+ if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* skb data changed, update pointers */
+ *iph_p = skb->nh.iph;
+ *t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4;
+ }
+ return 0;
+}
+
+
+#define IP_VS_XMIT(skb, rt) \
+do { \
+ skb->nfcache |= NFC_IPVS_PROPERTY; \
+ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \
+ rt->u.dst.dev, dst_output); \
+} while (0)
+
+
+/*
+ * NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ return NF_ACCEPT;
+}
+
+
+/*
+ * Bypass transmitter
+ * Let packets bypass the destination when the destination is not
+ * available, it may be only used in transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph = skb->nh.iph;
+ u8 tos = iph->tos;
+ int mtu;
+ struct flowi fl = {
+ .oif = 0,
+ .nl_u = {
+ .ip4_u = {
+ .daddr = iph->daddr,
+ .saddr = 0,
+ .tos = RT_TOS(tos), } },
+ .proto = iph->protocol,
+ };
+
+ EnterFunction(10);
+
+ if (ip_route_output_key(&rt, &fl)) {
+ IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+ "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+ goto tx_error_icmp;
+ }
+
+ /* MTU checking */
+ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ if (skb_is_nonlinear(skb) && skb->len <= mtu)
+ ip_send_check(iph);
+
+ if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
+ if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
+ ip_rt_put(rt);
+ IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
+ goto tx_error;
+ }
+ }
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * NAT transmitter (only for outside-to-inside nat forwarding)
+ * Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph;
+ union ip_vs_tphdr h;
+ int ihl;
+ unsigned short size;
+ int mtu;
+
+ EnterFunction(10);
+
+ /*
+ * If it has ip_vs_app helper, the helper may change the payload,
+ * so it needs full checksum checking and checksum calculation.
+ * If not, only the header (such as IP address and port number)
+ * will be changed, so it is fast to do incremental checksum update,
+ * and let the destination host do final checksum checking.
+ */
+
+ if (unlikely(cp->app && !pp->slave)) {
+ if (skb_is_nonlinear(skb) &&
+ skb_linearize(skb, GFP_ATOMIC) != 0)
+ return NF_DROP;
+ }
+
+ iph = skb->nh.iph;
+ ihl = iph->ihl << 2;
+ h.raw = (char*) iph + ihl;
+ size = ntohs(iph->tot_len) - ihl;
+
+ /* do TCP/UDP checksum checking if it has application helper */
+ if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
+ if (!pp->csum_check(skb, pp, iph, h, size))
+ goto tx_error;
+ }
+
+ /*
+ * Check if it is no clinet port connection ...
+ */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ ip_vs_conn_fill_cport(cp, h.portp[0]);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
+ }
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for");
+ goto tx_error;
+ }
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* copy-on-write the packet before mangling it */
+ if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
+ return NF_DROP;
+
+ /* mangle the packet */
+ iph->daddr = cp->daddr;
+ if (pp->dnat_handler) {
+ pp->dnat_handler(skb, pp, cp, iph, h, size);
+ iph = skb->nh.iph;
+ h.raw = (char*) iph + ihl;
+ }
+ ip_send_check(iph);
+
+ IP_VS_DBG_PKT(10, pp, iph, "After DNAT");
+
+ /* FIXME: when application helper enlarges the packet and the length
+ is larger than the MTU of outgoing device, there will be still
+ MTU problem. */
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, its
+ * destination will be set to cp->daddr. Most code of this function
+ * is taken from ipip.c.
+ *
+ * It is used in VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ * the encapsulated packet, it will decapsulate the packet, processe
+ * the request and return the response packets directly to the client
+ * without passing the load balancer. This can greatly increase the
+ * scalability of virtual server.
+ *
+ * Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct net_device *tdev; /* Device to other host */
+ struct iphdr *old_iph = skb->nh.iph;
+ u8 tos = old_iph->tos;
+ u16 df = old_iph->frag_off;
+ struct iphdr *iph; /* Our new IP header */
+ int max_headroom; /* The extra header space needed */
+ int mtu;
+
+ EnterFunction(10);
+
+ if (skb->protocol != __constant_htons(ETH_P_IP)) {
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+ "ETH_P_IP: %d, skb protocol: %d\n",
+ __constant_htons(ETH_P_IP), skb->protocol);
+ goto tx_error;
+ }
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+ goto tx_error_icmp;
+
+ tdev = rt->u.dst.dev;
+
+ mtu = dst_pmtu(&rt->u.dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+ goto tx_error;
+ }
+ if (skb->dst)
+ skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+ df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+ if ((old_iph->frag_off&__constant_htons(IP_DF))
+ && mtu < ntohs(old_iph->tot_len)) {
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ if (skb_is_nonlinear(skb))
+ ip_send_check(old_iph);
+
+ skb->h.raw = skb->nh.raw;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = (((tdev->hard_header_len+15)&~15)+sizeof(struct iphdr));
+
+ if (skb_headroom(skb) < max_headroom
+ || skb_cloned(skb) || skb_shared(skb)) {
+ struct sk_buff *new_skb =
+ skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb) {
+ ip_rt_put(rt);
+ kfree_skb(skb);
+ IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+ return -EINVAL;
+ }
+ kfree_skb(skb);
+ skb = new_skb;
+ }
+
+ skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = skb->nh.iph;
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = IPPROTO_IPIP;
+ iph->tos = tos;
+ iph->daddr = rt->rt_dst;
+ iph->saddr = rt->rt_src;
+ iph->ttl = old_iph->ttl;
+ iph->tot_len = htons(skb->len);
+ ip_select_ident(iph, &rt->u.dst, NULL);
+ ip_send_check(iph);
+
+ skb->ip_summed = CHECKSUM_NONE;
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * Direct Routing transmitter
+ * Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph = skb->nh.iph;
+ int mtu;
+
+ EnterFunction(10);
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_pmtu(&rt->u.dst);
+ if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ ip_rt_put(rt);
+ IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+ goto tx_error;
+ }
+
+ if (skb_is_nonlinear(skb) && skb->len <= mtu)
+ ip_send_check(iph);
+
+ if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
+ if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
+ ip_rt_put(rt);
+ IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
+ goto tx_error;
+ }
+ }
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ kfree_skb(skb);
+ return NF_STOLEN;
+}
+
+
+/*
+ * ICMP packet transmitter
+ * called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ struct rtable *rt; /* Route to the other host */
+ struct iphdr *iph;
+ struct icmphdr *icmph;
+ struct iphdr *ciph; /* The ip header contained within the ICMP */
+ unsigned short len;
+ union ip_vs_tphdr h;
+ int mtu;
+ int rc;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp);
+ else
+ rc = NF_ACCEPT;
+ atomic_inc(&cp->in_pkts);
+ __ip_vs_conn_put(cp);
+ goto out;
+ }
+
+ iph = skb->nh.iph;
+ icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
+ len = ntohs(iph->tot_len) - (iph->ihl<<2);
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+
+ if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+ goto tx_error_icmp;
+
+ /* MTU checking */
+ mtu = dst_pmtu(&rt->u.dst);
+ if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+ ip_rt_put(rt);
+ icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+ goto tx_error;
+ }
+
+ /* drop old route */
+ dst_release(skb->dst);
+ skb->dst = &rt->u.dst;
+
+ /* copy-on-write the packet before mangling it */
+ if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len,
+ &iph, (unsigned char**)&icmph)) {
+ rc = NF_DROP;
+ goto out;
+ }
+ ciph = (struct iphdr *) (icmph + 1);
+ h.raw = (char *) ciph + (ciph->ihl << 2);
+
+ /* The ICMP packet for VS/NAT must be written to correct addresses
+ before being forwarded to the right server */
+
+ /* First change the dest IP address, and recalc checksum */
+ iph->daddr = cp->daddr;
+ ip_send_check(iph);
+
+ /* Now change the *source* address in the contained IP */
+ ciph->saddr = cp->daddr;
+ ip_send_check(ciph);
+
+ /* the TCP/UDP source port - cannot redo check */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
+ h.portp[0] = cp->dport;
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP");
+
+#ifdef CONFIG_NETFILTER_DEBUG
+ skb->nf_debug = 0;
+#endif /* CONFIG_NETFILTER_DEBUG */
+ IP_VS_XMIT(skb, rt);
+
+ rc = NF_STOLEN;
+ goto out;
+
+ tx_error_icmp:
+ dst_link_failure(skb);
+ tx_error:
+ dev_kfree_skb(skb);
+ rc = NF_STOLEN;
+ out:
+ LeaveFunction(10);
+ return rc;
+}
static struct firewall_ops *fwops;
+#ifdef CONFIG_IP_VS
+/* From ip_vs_core.c */
+extern unsigned int
+check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *));
+#endif
+
/* They call these; we do what they want. */
int register_firewall(int pf, struct firewall_ops *fw)
{
return NF_ACCEPT;
case FW_MASQUERADE:
- if (hooknum == NF_IP_FORWARD)
+ if (hooknum == NF_IP_FORWARD) {
+#ifdef CONFIG_IP_VS
+ /* check if it is for ip_vs */
+ if (check_for_ip_vs_out(pskb, okfn) == NF_STOLEN)
+ return NF_STOLEN;
+#endif
return do_masquerade(pskb, out);
+ }
else return NF_ACCEPT;
case FW_REDIRECT:
EXPORT_SYMBOL(in_aton);
EXPORT_SYMBOL(ip_mc_inc_group);
EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(ip_mc_join_group);
EXPORT_SYMBOL(ip_finish_output);
EXPORT_SYMBOL(inet_stream_ops);
EXPORT_SYMBOL(inet_dgram_ops);