[lxc-devel] limit the network traffic of container from the host

lsmushroom lsmushroom at 126.com
Tue Jun 4 12:50:49 UTC 2013


Hi All,
      Recently, we have been trying to find a suitable way to limit the network traffic generated by processes running in a container. The network type we use for our containers is veth. We tried TC combined with the cgroup net_cls subsystem, which successfully fulfilled our goal. However, it requires adding the configuration inside the container. As we will provide the container as a service, it is obviously unacceptable to allow the end user to modify the bandwidth allocation.
 
 Thus, we added a new optional keyword named "peer" to the tc command, which is placed right after the keyword dev, and we made a small modification to the kernel to support it.
 
 Below is an example of how to use it :
 
 Related hardware information: the two ends of the veth device used by the container:
 
 lxc.network.name = eth0
 lxc.network.veth.pair = veth-vps1


 Add configuration:
 tc qdisc add dev peer veth-vps1 root handle 1:0 htb default 4   
 tc class add dev  peer veth-vps1 parent 1: classid 1:2 htb rate 3mbit    
 tc class add dev  peer veth-vps1 parent 1: classid 1:3 htb rate 2mbit
 tc class add dev  peer veth-vps1 parent 1: classid 1:4 htb rate 1000mbit
 tc filter add dev  peer veth-vps1 protocol ip parent 1:0 prio 1 handle 1: cgroup
 
 Display configuration:
 tc -s -d qdisc show dev peer veth-vps1 
 tc -s -d class show dev peer veth-vps1 
 tc filter show dev peer veth-vps1
   
 Delete configuration:
 tc qdisc del dev peer veth-vps1 root
 tc filter del  dev peer veth-vps1 parent 1: prio 2 cgroup
 tc class del dev peer veth-vps1 parent 1: classid 1:3 htb rate 2mbit
 
 All of the configurations listed above operate not on veth-vps1 itself, but on its peer end.


 In this way, we can modify the configuration of the other end of a veth device without needing to know its device name or which namespace it is in. Thus we can limit the network traffic of the container from the host, without needing access to the container.
 
 The modification is based on RHEL6.3 , kernel version: 2.6.32-279.22.1.el6 , iproute version: iproute2-2.6.32
  We hope it is helpful; any suggestions are welcome.


related patch:
diff --git a/tc/tc_class.c b/tc/tc_class.c
index 9d4eea5..c894bf2 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -51,6 +51,7 @@ int tc_class_modify(int cmd, unsigned flags, int argc, char **argv)
 struct tc_estimator est;
 char  d[16];
 char  k[16];
+int  flag = 0;
 
 memset(&req, 0, sizeof(req));
 memset(&est, 0, sizeof(est));
@@ -67,6 +68,10 @@ int tc_class_modify(int cmd, unsigned flags, int argc, char **argv)
 NEXT_ARG();
 if (d[0])
 duparg("dev", *argv);
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 } else if (strcmp(*argv, "classid") == 0) {
 __u32 handle;
@@ -136,6 +141,9 @@ int tc_class_modify(int cmd, unsigned flags, int argc, char **argv)
 fprintf(stderr, "Cannot find device \"%s\"\n", d);
 return 1;
 }
+if(flag)
+req.t.tcm_ifindex |= 0x80000000;
+
 }
 
 if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
@@ -236,6 +244,7 @@ int tc_class_list(int argc, char **argv)
 {
 struct tcmsg t;
 char d[16];
+int flag = 0;
 
 memset(&t, 0, sizeof(t));
 t.tcm_family = AF_UNSPEC;
@@ -246,6 +255,10 @@ int tc_class_list(int argc, char **argv)
 NEXT_ARG();
 if (d[0])
 duparg("dev", *argv);
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 } else if (strcmp(*argv, "qdisc") == 0) {
 NEXT_ARG();
@@ -291,6 +304,9 @@ int tc_class_list(int argc, char **argv)
 return 1;
 }
 filter_ifindex = t.tcm_ifindex;
+
+if(flag)
+t.tcm_ifindex |= 0x80000000;
 }
 
  if (rtnl_dump_request(&rth, RTM_GETTCLASS, &t, sizeof(t)) < 0) {
diff --git a/tc/tc_filter.c b/tc/tc_filter.c
index 919c57c..bec5b1a 100644
--- a/tc/tc_filter.c
+++ b/tc/tc_filter.c
@@ -60,6 +60,7 @@ int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv)
 char  d[16];
 char  k[16];
 struct tc_estimator est;
+int flag = 0;
 
 memset(&req, 0, sizeof(req));
 memset(&est, 0, sizeof(est));
@@ -80,6 +81,10 @@ int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv)
 NEXT_ARG();
 if (d[0])
 duparg("dev", *argv);
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 } else if (strcmp(*argv, "root") == 0) {
 if (req.t.tcm_parent) {
@@ -165,6 +170,9 @@ int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv)
 fprintf(stderr, "Cannot find device \"%s\"\n", d);
 return 1;
 }
+if(flag)
+req.t.tcm_ifindex |= 0x80000000;
+
 }
 
  if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) {
@@ -267,6 +275,7 @@ int tc_filter_list(int argc, char **argv)
 __u32 prio = 0;
 __u32 protocol = 0;
 char *fhandle = NULL;
+int flag = 0;
 
 memset(&t, 0, sizeof(t));
 t.tcm_family = AF_UNSPEC;
@@ -277,6 +286,10 @@ int tc_filter_list(int argc, char **argv)
 NEXT_ARG();
 if (d[0])
 duparg("dev", *argv);
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 } else if (strcmp(*argv, "root") == 0) {
 if (t.tcm_parent) {
@@ -334,6 +347,9 @@ int tc_filter_list(int argc, char **argv)
 return 1;
 }
 filter_ifindex = t.tcm_ifindex;
+
+if(flag)
+t.tcm_ifindex |= 0x80000000;
 }
 
  if (rtnl_dump_request(&rth, RTM_GETTFILTER, &t, sizeof(t)) < 0) {
diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index c7f2988..3ee7bf6 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -59,6 +59,7 @@ int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
 struct tcmsg t;
 char   buf[TCA_BUF_MAX];
 } req;
+int flag = 0;
 
 memset(&req, 0, sizeof(req));
 memset(&stab, 0, sizeof(stab));
@@ -76,6 +77,10 @@ int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
 NEXT_ARG();
 if (d[0])
 duparg("dev", *argv);
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 } else if (strcmp(*argv, "handle") == 0) {
 __u32 handle;
@@ -184,6 +189,9 @@ int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
 return 1;
 }
 req.t.tcm_ifindex = idx;
+
+if(flag)
+req.t.tcm_ifindex |= 0x80000000;
 }
 
  if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
@@ -281,6 +289,7 @@ int tc_qdisc_list(int argc, char **argv)
 {
 struct tcmsg t;
 char d[16];
+int flag = 0;
 
 memset(&t, 0, sizeof(t));
 t.tcm_family = AF_UNSPEC;
@@ -289,6 +298,10 @@ int tc_qdisc_list(int argc, char **argv)
 while (argc > 0) {
 if (strcmp(*argv, "dev") == 0) {
 NEXT_ARG();
+if(strcmp(*argv , "peer") == 0){
+flag = 1;
+NEXT_ARG();
+}
 strncpy(d, *argv, sizeof(d)-1);
 #ifdef TC_H_INGRESS
                 } else if (strcmp(*argv, "ingress") == 0) {
@@ -315,7 +328,12 @@ int tc_qdisc_list(int argc, char **argv)
 fprintf(stderr, "Cannot find device \"%s\"\n", d);
 return 1;
 }
+
+if(flag)
+t.tcm_ifindex |= 0x80000000;
+
 filter_ifindex = t.tcm_ifindex;
+
 }
 
  if (rtnl_dump_request(&rth, RTM_GETQDISC, &t, sizeof(t)) < 0) {




diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index f911ec7..48f21e5 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -5,6 +5,9 @@
 #include <linux/ktime.h>
 #include <net/sch_generic.h>
 
+#define IDX_MASK 0x7FFFFFFFul
+#define FLG_MASK ~IDX_MASK
+
 struct qdisc_walker
 {
 intstop;
@@ -112,4 +115,6 @@ static inline unsigned psched_mtu(const struct net_device *dev)
 return dev->mtu + dev->hard_header_len;
 }
 
+int inline get_peer_dev(struct net_device** dev);
+
 #endif
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7cf6c0f..19909bf 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -136,6 +136,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 unsigned long fh;
 int err;
 int tp_created = 0;
+int flag = 0;
 
 if (net != &init_net)
 return -EINVAL;
@@ -158,10 +159,21 @@ replay:
 /* Find head of filter chain. */
 
 /* Find link */
+if(t->tcm_ifindex & FLG_MASK){
+flag = 1;
+t->tcm_ifindex &= IDX_MASK;
+}
+
 dev = __dev_get_by_index(&init_net, t->tcm_ifindex);
 if (dev == NULL)
 return -ENODEV;
 
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
 err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
 if (err < 0)
 return err;
@@ -416,15 +428,29 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 unsigned long cl = 0;
 const struct Qdisc_class_ops *cops;
 struct tcf_dump_args arg;
+int flag = 0;
+int err = 0;
 
 if (net != &init_net)
 return 0;
 
 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
 return skb->len;
-if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
+
+if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 return skb->len;
 
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
 if (!tcm->tcm_parent)
 q = dev->qdisc;
 else
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6cc404e..c5d663a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -948,6 +948,23 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
 return 0;
 }
 
+int inline get_peer_dev(struct net_device** dev)
+{ /* Swap *dev for the other end of its veth pair; returns 0 on success, -EINVAL otherwise. */
+struct net_device* end = *dev;
+int err = -EINVAL; /* default result: *dev is NULL or is not a veth device */
+
+if(!end)
+return err;
+
+if(end->rtnl_link_ops && !strcmp(end->rtnl_link_ops->kind , "veth")){ /* only devices whose link kind is "veth" are accepted */
+*dev = *(struct net_device**)netdev_priv(end); /* NOTE(review): reads the first pointer-sized word of the private area as the peer; presumably matches veth's private struct layout - confirm. The result is not NULL-checked here. */
+err = 0;
+}
+
+return err; /* on failure *dev is left untouched */
+}
+EXPORT_SYMBOL(get_peer_dev);
+
 /*
  * Delete/get qdisc.
  */
@@ -962,13 +979,25 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 struct Qdisc *q = NULL;
 struct Qdisc *p = NULL;
 int err;
+int flag = 0;
 
 if (net != &init_net)
 return -EINVAL;
+
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
 
 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 return -ENODEV;
 
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 if (err < 0)
 return err;
@@ -1024,6 +1053,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 u32 clid;
 struct Qdisc *q, *p;
 int err;
+int flag = 0;
 
 if (net != &init_net)
 return -EINVAL;
@@ -1034,9 +1064,20 @@ replay:
 clid = tcm->tcm_parent;
 q = p = NULL;
 
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
+
 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 return -ENODEV;
 
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 if (err < 0)
 return err;
@@ -1169,6 +1210,10 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 struct nlmsghdr  *nlh;
 unsigned char *b = skb_tail_pointer(skb);
 struct gnet_dump d;
+struct net *skb_net;
+struct net *dev_net;
+struct net_device *dev;
+int err;
 
 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
 tcm = NLMSG_DATA(nlh);
@@ -1184,6 +1229,20 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 goto nla_put_failure;
 q->qstats.qlen = q->q.qlen;
 
+if(skb->sk){
+skb_net = sock_net(skb->sk);
+dev_net = qdisc_dev(q)->nd_net;
+
+if( skb_net != dev_net ){
+dev = qdisc_dev(q);
+err = get_peer_dev(&dev);
+if(!err){
+tcm->tcm_ifindex = dev->ifindex;
+tcm->tcm_ifindex |= FLG_MASK;
+}
+}
+}
+
 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
 goto nla_put_failure;
 
@@ -1285,19 +1344,38 @@ done:
 
 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 {
+struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
 struct net *net = sock_net(skb->sk);
 int idx, q_idx;
 int s_idx, s_q_idx;
 struct net_device *dev;
-
+int flag = 0;
+int err = 0;
+
 if (net != &init_net)
 return 0;
 
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
+
+if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+return -ENODEV;
+
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
+net = dev->nd_net;
+
 s_idx = cb->args[0];
 s_q_idx = q_idx = cb->args[1];
 read_lock(&dev_base_lock);
 idx = 0;
-for_each_netdev(&init_net, dev) {
+for_each_netdev(net, dev) {
 struct netdev_queue *dev_queue;
 
 if (idx < s_idx)
@@ -1348,13 +1426,25 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 u32 clid = tcm->tcm_handle;
 u32 qid = TC_H_MAJ(clid);
 int err;
+int flag = 0;
 
 if (net != &init_net)
 return -EINVAL;
 
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
+
 if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
 return -ENODEV;
 
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
+
 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
 if (err < 0)
 return err;
@@ -1590,14 +1680,28 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 struct netdev_queue *dev_queue;
 struct net_device *dev;
 int t, s_t;
+int flag = 0;
+int err = 0;
 
 if (net != &init_net)
 return 0;
 
 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
 return 0;
-if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
-return 0;
+
+if(tcm->tcm_ifindex & FLG_MASK){
+flag = 1;
+tcm->tcm_ifindex &= IDX_MASK;
+}
+
+if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
+return -ENODEV;
+
+if(flag) {
+err = get_peer_dev(&dev);
+if(err < 0)
+return err;
+}
 
 s_t = cb->args[0];
 t = 0;


 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20130604/56a3f737/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tc-peer-iproute2-2.6.32.patch
Type: application/octet-stream
Size: 5044 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20130604/56a3f737/attachment.obj>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: tc-peer-kernel-2.6.32-279.22.1.el6.patch
Type: application/octet-stream
Size: 6792 bytes
Desc: not available
URL: <http://lists.linuxcontainers.org/pipermail/lxc-devel/attachments/20130604/56a3f737/attachment-0001.obj>


More information about the lxc-devel mailing list