@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
136136{
137137 struct inet_frag_queue * frag = from_timer (frag , t , timer );
138138 const struct iphdr * iph ;
139- struct sk_buff * head ;
139+ struct sk_buff * head = NULL ;
140140 struct net * net ;
141141 struct ipq * qp ;
142142 int err ;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
152152
153153 ipq_kill (qp );
154154 __IP_INC_STATS (net , IPSTATS_MIB_REASMFAILS );
155-
156- head = qp -> q .fragments ;
157-
158155 __IP_INC_STATS (net , IPSTATS_MIB_REASMTIMEOUT );
159156
160- if (!( qp -> q .flags & INET_FRAG_FIRST_IN ) || ! head )
160- 157+ if (!(qp -> q .flags & INET_FRAG_FIRST_IN ))
161158 goto out ;
162159
160+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
161+ * pull the head out of the tree in order to be able to
162+ * deal with head->dev.
163+ */
164+ if (qp -> q .fragments ) {
165+ head = qp -> q .fragments ;
166+ qp -> q .fragments = head -> next ;
167+ } else {
168+ head = skb_rb_first (& qp -> q .rb_fragments );
169+ if (!head )
170+ goto out ;
171+ rb_erase (& head -> rbnode , & qp -> q .rb_fragments );
172+ memset (& head -> rbnode , 0 , sizeof (head -> rbnode ));
173+ barrier ();
174+ }
175+ if (head == qp -> q .fragments_tail )
176+ qp -> q .fragments_tail = NULL ;
177+
178+ sub_frag_mem_limit (qp -> q .net , head -> truesize );
179+
163180 head -> dev = dev_get_by_index_rcu (net , qp -> iif );
164181 if (!head -> dev )
165182 goto out ;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
179196 (skb_rtable (head )-> rt_type != RTN_LOCAL ))
180197 goto out ;
181198
182- skb_get (head );
183199 spin_unlock (& qp -> q .lock );
184200 icmp_send (head , ICMP_TIME_EXCEEDED , ICMP_EXC_FRAGTIME , 0 );
185- kfree_skb (head );
186201 goto out_rcu_unlock ;
187202
188203out :
189204 spin_unlock (& qp -> q .lock );
190205out_rcu_unlock :
191206 rcu_read_unlock ();
207+ if (head )
208+ kfree_skb (head );
192209 ipq_put (qp );
193210}
194211
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
231248 end = atomic_inc_return (& peer -> rid );
232249 qp -> rid = end ;
233250
234- rc = qp -> q .fragments && (end - start ) > max ;
251+ rc = qp -> q .fragments_tail && (end - start ) > max ;
235252
236253 if (rc ) {
237254 struct net * net ;
@@ -245,28 +262,21 @@ static int ip_frag_too_far(struct ipq *qp)
245262
246263static int ip_frag_reinit (struct ipq * qp )
247264{
248- struct sk_buff * fp ;
249265 unsigned int sum_truesize = 0 ;
250266
251267 if (!mod_timer (& qp -> q .timer , jiffies + qp -> q .net -> timeout )) {
252268 refcount_inc (& qp -> q .refcnt );
253269 return - ETIMEDOUT ;
254270 }
255271
256- fp = qp -> q .fragments ;
257- do {
258- struct sk_buff * xp = fp -> next ;
259-
260- sum_truesize += fp -> truesize ;
261- kfree_skb (fp );
262- fp = xp ;
263- } while (fp );
272+ sum_truesize = skb_rbtree_purge (& qp -> q .rb_fragments );
264273 sub_frag_mem_limit (qp -> q .net , sum_truesize );
265274
266275 qp -> q .flags = 0 ;
267276 qp -> q .len = 0 ;
268277 qp -> q .meat = 0 ;
269278 qp -> q .fragments = NULL ;
279+ qp -> q .rb_fragments = RB_ROOT ;
270280 qp -> q .fragments_tail = NULL ;
271281 qp -> iif = 0 ;
272282 qp -> ecn = 0 ;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp)
278288static int ip_frag_queue (struct ipq * qp , struct sk_buff * skb )
279289{
280290 struct net * net = container_of (qp -> q .net , struct net , ipv4 .frags );
281- struct sk_buff * prev , * next ;
291+ struct rb_node * * rbn , * parent ;
292+ struct sk_buff * skb1 ;
282293 struct net_device * dev ;
283294 unsigned int fragsize ;
284295 int flags , offset ;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
341352 if (err )
342353 goto err ;
343354
344- /* Find out which fragments are in front and at the back of us
345- * in the chain of fragments so far. We must know where to put
346- * this fragment, right?
347- */
348- prev = qp -> q .fragments_tail ;
349- if (!prev || prev -> ip_defrag_offset < offset ) {
350- next = NULL ;
351- goto found ;
352- }
353- prev = NULL ;
354- for (next = qp -> q .fragments ; next != NULL ; next = next -> next ) {
355- if (next -> ip_defrag_offset >= offset )
356- break ; /* bingo! */
357- prev = next ;
358- }
355+ /* Note : skb->rbnode and skb->dev share the same location. */
356+ dev = skb -> dev ;
357+ /* Makes sure compiler wont do silly aliasing games */
358+ barrier ();
359359
360- found :
361360 /* RFC5722, Section 4, amended by Errata ID : 3089
362361 * When reassembling an IPv6 datagram, if
363362 * one or more its constituent fragments is determined to be an
364363 * overlapping fragment, the entire datagram (and any constituent
365364 * fragments) MUST be silently discarded.
366365 *
367- * We do the same here for IPv4.
366+ * We do the same here for IPv4 (and increment an snmp counter) .
368367 */
369368
370- /* Is there an overlap with the previous fragment? */
371- if (prev &&
372- (prev -> ip_defrag_offset + prev -> len ) > offset )
373- goto discard_qp ;
374-
375- /* Is there an overlap with the next fragment? */
376- if (next && next -> ip_defrag_offset < end )
377- goto discard_qp ;
369+ /* Find out where to put this fragment. */
370+ skb1 = qp -> q .fragments_tail ;
371+ if (!skb1 ) {
372+ /* This is the first fragment we've received. */
373+ rb_link_node (& skb -> rbnode , NULL , & qp -> q .rb_fragments .rb_node );
374+ qp -> q .fragments_tail = skb ;
375+ } else if ((skb1 -> ip_defrag_offset + skb1 -> len ) < end ) {
376+ /* This is the common/special case: skb goes to the end. */
377+ /* Detect and discard overlaps. */
378+ if (offset < (skb1 -> ip_defrag_offset + skb1 -> len ))
379+ goto discard_qp ;
380+ /* Insert after skb1. */
381+ rb_link_node (& skb -> rbnode , & skb1 -> rbnode , & skb1 -> rbnode .rb_right );
382+ qp -> q .fragments_tail = skb ;
383+ } else {
384+ /* Binary search. Note that skb can become the first fragment, but
385+ * not the last (covered above). */
386+ rbn = & qp -> q .rb_fragments .rb_node ;
387+ do {
388+ parent = * rbn ;
389+ skb1 = rb_to_skb (parent );
390+ if (end <= skb1 -> ip_defrag_offset )
391+ rbn = & parent -> rb_left ;
392+ else if (offset >= skb1 -> ip_defrag_offset + skb1 -> len )
393+ rbn = & parent -> rb_right ;
394+ else /* Found an overlap with skb1. */
395+ goto discard_qp ;
396+ } while (* rbn );
397+ /* Here we have parent properly set, and rbn pointing to
398+ * one of its NULL left/right children. Insert skb. */
399+ rb_link_node (& skb -> rbnode , parent , rbn );
400+ }
401+ rb_insert_color (& skb -> rbnode , & qp -> q .rb_fragments );
378402
379- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
380- dev = skb -> dev ;
381403 if (dev )
382404 qp -> iif = dev -> ifindex ;
383- /* Makes sure compiler wont do silly aliasing games */
384- barrier ();
385405 skb -> ip_defrag_offset = offset ;
386406
387- /* Insert this fragment in the chain of fragments. */
388- skb -> next = next ;
389- if (!next )
390- qp -> q .fragments_tail = skb ;
391- if (prev )
392- prev -> next = skb ;
393- else
394- qp -> q .fragments = skb ;
395-
396407 qp -> q .stamp = skb -> tstamp ;
397408 qp -> q .meat += skb -> len ;
398409 qp -> ecn |= ecn ;
@@ -414,7 +425,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
414425 unsigned long orefdst = skb -> _skb_refdst ;
415426
416427 skb -> _skb_refdst = 0UL ;
417- err = ip_frag_reasm (qp , prev , dev );
428+ err = ip_frag_reasm (qp , skb , dev );
418429 skb -> _skb_refdst = orefdst ;
419430 return err ;
420431 }
@@ -431,15 +442,15 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
431442 return err ;
432443}
433444
434-
435445/* Build a new IP datagram from all its fragments. */
436-
437- static int ip_frag_reasm (struct ipq * qp , struct sk_buff * prev ,
446+ static int ip_frag_reasm (struct ipq * qp , struct sk_buff * skb ,
438447 struct net_device * dev )
439448{
440449 struct net * net = container_of (qp -> q .net , struct net , ipv4 .frags );
441450 struct iphdr * iph ;
442- struct sk_buff * fp , * head = qp -> q .fragments ;
451+ struct sk_buff * fp , * head = skb_rb_first (& qp -> q .rb_fragments );
452+ struct sk_buff * * nextp ; /* To build frag_list. */
453+ struct rb_node * rbn ;
443454 int len ;
444455 int ihlen ;
445456 int err ;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
453464 goto out_fail ;
454465 }
455466 /* Make the one we just received the head. */
456- if (prev ) {
457- head = prev -> next ;
458- fp = skb_clone (head , GFP_ATOMIC );
467+ if (head != skb ) {
468+ fp = skb_clone (skb , GFP_ATOMIC );
459469 if (!fp )
460470 goto out_nomem ;
461-
462- fp -> next = head -> next ;
463- if (!fp -> next )
471+ rb_replace_node (& skb -> rbnode , & fp -> rbnode , & qp -> q .rb_fragments );
472+ if (qp -> q .fragments_tail == skb )
464473 qp -> q .fragments_tail = fp ;
465- prev -> next = fp ;
466-
467- skb_morph (head , qp -> q .fragments );
468- head -> next = qp -> q .fragments -> next ;
469-
470- consume_skb (qp -> q .fragments );
471- qp -> q .fragments = head ;
474+ skb_morph (skb , head );
475+ rb_replace_node (& head -> rbnode , & skb -> rbnode ,
476+ & qp -> q .rb_fragments );
477+ consume_skb (head );
478+ head = skb ;
472479 }
473480
474- WARN_ON (!head );
475481 WARN_ON (head -> ip_defrag_offset != 0 );
476482
477483 /* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
496502 clone = alloc_skb (0 , GFP_ATOMIC );
497503 if (!clone )
498504 goto out_nomem ;
499- clone -> next = head -> next ;
500- head -> next = clone ;
501505 skb_shinfo (clone )-> frag_list = skb_shinfo (head )-> frag_list ;
502506 skb_frag_list_init (head );
503507 for (i = 0 ; i < skb_shinfo (head )-> nr_frags ; i ++ )
504508 plen += skb_frag_size (& skb_shinfo (head )-> frags [i ]);
505509 clone -> len = clone -> data_len = head -> data_len - plen ;
506- head -> data_len -= clone -> len ;
507- head -> len -= clone -> len ;
510+ skb -> truesize += clone -> truesize ;
508511 clone -> csum = 0 ;
509512 clone -> ip_summed = head -> ip_summed ;
510513 add_frag_mem_limit (qp -> q .net , clone -> truesize );
514+ skb_shinfo (head )-> frag_list = clone ;
515+ nextp = & clone -> next ;
516+ } else {
517+ nextp = & skb_shinfo (head )-> frag_list ;
511518 }
512519
513- skb_shinfo (head )-> frag_list = head -> next ;
514520 skb_push (head , head -> data - skb_network_header (head ));
515521
516- for (fp = head -> next ; fp ; fp = fp -> next ) {
522+ /* Traverse the tree in order, to build frag_list. */
523+ rbn = rb_next (& head -> rbnode );
524+ rb_erase (& head -> rbnode , & qp -> q .rb_fragments );
525+ while (rbn ) {
526+ struct rb_node * rbnext = rb_next (rbn );
527+ fp = rb_to_skb (rbn );
528+ rb_erase (rbn , & qp -> q .rb_fragments );
529+ rbn = rbnext ;
530+ * nextp = fp ;
531+ nextp = & fp -> next ;
532+ fp -> prev = NULL ;
533+ memset (& fp -> rbnode , 0 , sizeof (fp -> rbnode ));
517534 head -> data_len += fp -> len ;
518535 head -> len += fp -> len ;
519536 if (head -> ip_summed != fp -> ip_summed )
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
524541 }
525542 sub_frag_mem_limit (qp -> q .net , head -> truesize );
526543
544+ * nextp = NULL ;
527545 head -> next = NULL ;
546+ head -> prev = NULL ;
528547 head -> dev = dev ;
529548 head -> tstamp = qp -> q .stamp ;
530549 IPCB (head )-> frag_max_size = max (qp -> max_df_size , qp -> q .max_size );
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
552571
553572 __IP_INC_STATS (net , IPSTATS_MIB_REASMOKS );
554573 qp -> q .fragments = NULL ;
574+ qp -> q .rb_fragments = RB_ROOT ;
555575 qp -> q .fragments_tail = NULL ;
556576 return 0 ;
557577
0 commit comments