zotac:~/tmp$ git clone http://iki.fi/lindi/git/sendfileudp.git/
Cloning into sendfileudp...
zotac:~/tmp$ cd sendfileudp
zotac:~/tmp/sendfileudp$ make
make -C /lib/modules/2.6.32-5-amd64/build M=/home/lindi/tmp/sendfileudp modules
make[1]: Entering directory `/usr/src/linux-headers-2.6.32-5-amd64'
  CC [M]  /home/lindi/tmp/sendfileudp/sendfileudp.o
  Building modules, stage 2.
  MODPOST 1 modules
  CC      /home/lindi/tmp/sendfileudp/sendfileudp.mod.o
  LD [M]  /home/lindi/tmp/sendfileudp/sendfileudp.ko
make[1]: Leaving directory `/usr/src/linux-headers-2.6.32-5-amd64'
gcc -g -O2 -Wall -o sendfileudp_test sendfileudp_test.c
zotac:~/tmp/sendfileudp$ ./sendfileudp_test_execute.sh
version 7808a72fd73703003cf2fc0de81eb7e3885a177b
source zotac mtu 9000 target 10.99.0.1 port 1234
filename /dev/md0 len 10737418240 packetsize 61440
==== seq 0
==== method read-send
==== cmd ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234
real 17.62 user 0.02 sys 7.93 cpu 45% fault 0/189
real 17.29 user 0.01 sys 8.01 cpu 46% fault 0/191
real 17.68 user 0.01 sys 7.93 cpu 44% fault 0/190
==== method send-sendfile
==== cmd ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234
real 16.20 user 0.00 sys 6.39 cpu 39% fault 0/174
real 15.92 user 0.00 sys 6.36 cpu 39% fault 0/174
real 15.83 user 0.00 sys 6.40 cpu 40% fault 0/176
==== seq 1
==== method read-send
==== cmd ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234 -q 0
real 17.18 user 0.00 sys 8.30 cpu 48% fault 0/192
real 17.13 user 0.03 sys 8.30 cpu 48% fault 0/191
real 16.95 user 0.00 sys 8.39 cpu 49% fault 0/192
==== method send-sendfile
==== cmd ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234 -q 0
real 16.18 user 0.02 sys 6.43 cpu 39% fault 0/173
real 16.65 user 0.00 sys 6.43 cpu 38% fault 0/174
real 16.02 user 0.02 sys 6.43 cpu 40% fault 0/174

# (8*10737418240/16.0)/(1000*1000*1000) = 5.4 (gigabits per second)

# Extra setup on zotac:
# sudo chmod a+r /dev/md0
# ip link set eth3 mtu 9000

# Extra setup on 10.99.0.1 (the receiving host):
# sudo iptables -A INPUT -p udp --dport 1234 -j DROP
# ip link set eth4 mtu 9000

# perf record -g -- ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234
# mv perf.data send-sendfile.data
# perf report -i send-sendfile.data -g
# perf report -i send-sendfile.data -g flat,0
    32.88%  sendfileudp_tes  [kernel]                   [k] csum_partial
            32.88%
                csum_partial
                sendfile64
     2.81%  sendfileudp_tes  [kernel]                   [k] _spin_lock
# sudo stap -e 'global i; probe kernel.function("csum_partial") { if (i++ == 10000) { print_backtrace(); exit(); } }'
 0xffffffff81194d78 : csum_partial+0x0/0x14f [kernel]
 0xffffffff81274b87 : ip_append_page+0x3b8/0x48d [kernel]
 0xffffffff812932e5 : udp_sendpage+0xac/0x12c [kernel]
 0xffffffff812988f2 : inet_sendpage+0x5c/0x91 [kernel]
 0xffffffff8123f60d : kernel_sendpage+0x16/0x1f [kernel]
 0xffffffff8123f64b : sock_sendpage+0x35/0x3a [kernel]
 0xffffffff81109632 : pipe_to_sendpage+0x5a/0x61 [kernel]
 0xffffffff81109685 : splice_from_pipe_feed+0x4c/0xd4 [kernel]
 0xffffffff8110990e : __splice_from_pipe+0x3e/0x69 [kernel]
 0xffffffff81109986 : splice_from_pipe+0x4d/0x63 [kernel]
 0xffffffff8110a4df : direct_splice_actor+0x1b/0x1e [kernel]
 0xffffffff81109ecb : splice_direct_to_actor+0xdc/0x188 [kernel]
 0xffffffff81109fc1 : do_splice_direct+0x4a/0x64 [kernel]
 0xffffffff810ee8c5 : do_sendfile+0x148/0x1ce [kernel]
 0xffffffff810ee9b9 : sys_sendfile64+0x6e/0x89 [kernel]
 0xffffffff81010b42 : system_call_fastpath+0x16/0x1b [kernel]
$ sudo ethtool -k eth3
Offload parameters for eth3:
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: on
large-receive-offload: off
ntuple-filters: off
receive-hashing: off

could we disable csum_partial?
# sudo stap -g -e 'probe kernel.function("csum_partial") { if (execname() == "sendfileudp_tes") { $len = 0; } }'
This does not seem to affect real/sys time, but perf no longer shows
csum_partial taking so much CPU. Instead we see:
     8.89%  sendfileudp_tes  [kernel]                   [k] 0xffffffffa03120d0
             4.42%
                0xffffffffa03120d0
                sendfile64

The fragment in ip_output.c looks suspicious:
if (transhdrlen &&
    length + fragheaderlen <= mtu &&
    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
    !exthdrlen)
        csummode = CHECKSUM_PARTIAL;
# perf record -g -- ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234
# perf report -g flat,0
With -s 8900, perf does not show csum_partial and top shows only around 30% CPU usage. Using -s 9000 does show csum_partial.
See http://book.chinaunix.net/special/ebook/oreilly/Understanding_Linux_Network_Internals/0596002556/understandlni-CHP-19-SECT-1.html

Results on kernel 2.6.32:
zotac:~/tmp/sendfileudp$ ./sendfileudp_test_execute.sh
version f6a8f7d0027ac43bbdcc2e668db3d65dc9cf92bb 
source zotac mtu 9000 dev eth3
Offload parameters for eth3:
Cannot get device GRO settings: Operation not permitted
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: off
large-receive-offload: off
ntuple-filters: off
receive-hashing: off
target 10.99.0.1 port 1234
filename /dev/md0 len 10737418240 packetsize 61440
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234
real 18.63 user 0.00 sys 7.97 cpu 42% fault 0/188
real 17.70 user 0.01 sys 7.98 cpu 45% fault 0/188
real 16.71 user 0.02 sys 7.97 cpu 47% fault 0/188
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234 -q 0
real 17.41 user 0.00 sys 8.30 cpu 47% fault 0/190
real 17.65 user 0.02 sys 8.32 cpu 47% fault 0/188
real 17.57 user 0.02 sys 8.30 cpu 47% fault 0/188
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234
real 16.44 user 0.01 sys 6.35 cpu 38% fault 0/172
real 16.19 user 0.01 sys 6.38 cpu 39% fault 0/173
real 16.12 user 0.01 sys 6.39 cpu 39% fault 0/173
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234 -q 0
real 15.96 user 0.02 sys 6.38 cpu 40% fault 0/172
real 15.99 user 0.00 sys 6.40 cpu 40% fault 0/173
real 16.26 user 0.02 sys 6.40 cpu 39% fault 0/173
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m read-send 10.99.0.1 1234
real 16.77 user 0.02 sys 8.35 cpu 49% fault 0/175
real 16.92 user 0.08 sys 8.24 cpu 49% fault 0/176
real 16.62 user 0.08 sys 8.24 cpu 50% fault 0/177
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m read-send 10.99.0.1 1234 -q 0
real 17.26 user 0.06 sys 8.31 cpu 48% fault 0/176
real 17.19 user 0.06 sys 8.25 cpu 48% fault 0/176
real 17.08 user 0.05 sys 8.34 cpu 49% fault 0/176
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234
real 15.90 user 0.04 sys 4.74 cpu 30% fault 0/173
real 16.53 user 0.03 sys 4.74 cpu 28% fault 0/171
real 16.36 user 0.03 sys 4.82 cpu 29% fault 0/172
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234 -q 0
real 16.36 user 0.06 sys 4.91 cpu 30% fault 0/171
real 15.95 user 0.07 sys 4.87 cpu 31% fault 0/173
real 16.01 user 0.04 sys 4.94 cpu 31% fault 0/172




Now the largest perf node is:
3.64%  sendfileudp_tes  [kernel]                   [k] _spin_lock
                      |--95.33%-- sendfile64
                      |--4.03%-- close
                       --0.64%-- send
sudo stap -e 'probe kernel.function("get_page_from_freelist") { if (execname() == "sendfileudp_tes") { print_backtrace(); } }'
 0xffffffff810b9295 : get_page_from_freelist+0x0/0x760 [kernel]
 0xffffffff810b9d5c : __alloc_pages_nodemask+0x11c/0x5f4 [kernel]
 0xffffffff810bb689 : __do_page_cache_readahead+0x9b/0x1b4 [kernel]
 0xffffffff810bb7be : ra_submit+0x1c/0x20 [kernel]
 0xffffffff810bbaad : page_cache_async_readahead+0x75/0xad [kernel]
 0xffffffff8110b37b : __generic_file_splice_read+0x20b/0x3bd [kernel]
 0xffffffff8110b567 : generic_file_splice_read+0x3a/0x63 [kernel]
 0xffffffff81109ead : splice_direct_to_actor+0xbe/0x188 [kernel]
 0xffffffff81109fc1 : do_splice_direct+0x4a/0x64 [kernel]
 0xffffffff810ee8c5 : do_sendfile+0x148/0x1ce [kernel]
 0xffffffff810ee9b9 : sys_sendfile64+0x6e/0x89 [kernel]
 0xffffffff81010b42 : system_call_fastpath+0x16/0x1b [kernel]

$ dpkg-query -W linux-image-$(uname -r)
linux-image-2.6.36-trunk-amd64  2.6.36-1~experimental.1
zotac:~/tmp/sendfileudp$ ./sendfileudp_test_execute.sh 
version f6a8f7d0027ac43bbdcc2e668db3d65dc9cf92bb 
source zotac mtu 9000 dev eth3
Offload parameters for eth3:
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: on
large-receive-offload: off
ntuple-filters: off
receive-hashing: off
target 10.99.0.1 port 1234
filename /dev/md0 len 10737418240 packetsize 61440
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234
real 18.67 user 0.02 sys 9.90 cpu 53% fault 0/188
real 18.29 user 0.02 sys 9.83 cpu 53% fault 0/188
real 17.74 user 0.01 sys 9.84 cpu 55% fault 0/188
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m read-send 10.99.0.1 1234 -q 0
real 18.19 user 0.00 sys 10.22 cpu 56% fault 0/189
real 18.07 user 0.00 sys 10.04 cpu 55% fault 0/188
real 18.38 user 0.03 sys 10.16 cpu 55% fault 0/189
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234
real 16.68 user 0.00 sys 8.58 cpu 51% fault 0/172
real 16.70 user 0.00 sys 8.62 cpu 51% fault 0/172
real 17.35 user 0.01 sys 8.53 cpu 49% fault 0/173
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 61440 -m send-sendfile 10.99.0.1 1234 -q 0
real 16.79 user 0.04 sys 8.58 cpu 51% fault 0/171
real 16.38 user 0.01 sys 8.62 cpu 52% fault 0/173
real 16.72 user 0.02 sys 8.69 cpu 52% fault 0/171
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m read-send 10.99.0.1 1234
real 17.89 user 0.06 sys 9.92 cpu 55% fault 0/176
real 18.74 user 0.09 sys 10.08 cpu 54% fault 0/176
real 17.35 user 0.06 sys 9.97 cpu 57% fault 0/177
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m read-send 10.99.0.1 1234 -q 0
real 17.57 user 0.06 sys 9.92 cpu 56% fault 0/176
real 18.23 user 0.10 sys 9.98 cpu 55% fault 0/176
real 18.12 user 0.04 sys 10.06 cpu 55% fault 0/175
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234
real 16.60 user 0.04 sys 7.74 cpu 46% fault 0/172
real 17.17 user 0.06 sys 7.94 cpu 46% fault 0/173
real 16.94 user 0.06 sys 7.58 cpu 45% fault 0/172
== ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234 -q 0
real 16.22 user 0.11 sys 7.70 cpu 48% fault 0/172
real 17.06 user 0.08 sys 7.69 cpu 45% fault 0/172
real 16.68 user 0.11 sys 7.79 cpu 47% fault 0/172
     4.44%  sendfileudp_tes  [kernel.kallsyms]  [k] kmem_cache_alloc
              |
              --- kmem_cache_alloc
                  0x7f1dae61e5ba

     3.52%  sendfileudp_tes  [kernel.kallsyms]  [k] get_page_from_freelist
              |
              --- get_page_from_freelist
                  0x7f1dae61e5ba

     3.29%  sendfileudp_tes  [kernel.kallsyms]  [k] _raw_spin_lock
              |
              --- _raw_spin_lock
                 |
                 |--93.39%-- 0x7f1dae61e5ba
                 |
                  --6.61%-- close

     2.72%  sendfileudp_tes  [kernel.kallsyms]  [k] ip_append_page
0xffffffff810e63be : kmem_cache_alloc+0x0/0xea [kernel]
0xffffffff810b1919 : mempool_alloc+0x53/0x10e [kernel]
0xffffffff8117d527 : get_request+0x1b3/0x2b7 [kernel]
0xffffffff8117dc09 : get_request_wait+0x21/0x17d [kernel]
0xffffffff8117e236 : __make_request+0x325/0x472 [kernel]
0xffffffff8117ca72 : generic_make_request+0x2a0/0x31d [kernel]
0xffffffff8117cbbd : submit_bio+0xce/0xea [kernel]
0xffffffff8110ca52 : submit_bh+0xe5/0x105 [kernel]
0xffffffff8110f67a : block_read_full_page+0x1d7/0x1f3 [kernel]
0xffffffff810b7b17 : __do_page_cache_readahead+0x180/0x1b3 [kernel]
0xffffffff810b7b66 : ra_submit+0x1c/0x20 [kernel]
0xffffffff8110af92 : __generic_file_splice_read+0x10f/0x401 [kernel]
0xffffffff8110b2bf : generic_file_splice_read+0x3b/0x64 [kernel]
0xffffffff81109a62 : splice_direct_to_actor+0xba/0x17b [kernel]
0xffffffff81109b68 : do_splice_direct+0x45/0x58 [kernel]
0xffffffff810eca06 : do_sendfile+0x12c/0x1b6 [kernel]
0xffffffff810ecb00 : sys_sendfile64+0x70/0x8a [kernel]
0xffffffff81008a02 : system_call_fastpath+0x16/0x1b [kernel]

sudo opcontrol --vmlinux=/usr/lib/debug/boot/vmlinux-2.6.32-5-amd64
sudo opcontrol --start
./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234
sudo opcontrol --stop
opreport -l -p /lib/modules/2.6.32-5-amd64
CPU: Intel Architectural Perfmon, speed 1200 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (No unit mask) count 100000
samples  %        image name               app name                 symbol name
6595      3.0346  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   kmem_cache_alloc
5943      2.7346  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   get_page_from_freelist
5577      2.5661  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   create_empty_buffers
5269      2.4244  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   end_buffer_async_read
5062      2.3292  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   drop_buffers
4381      2.0158  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   put_page
4380      2.0154  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   ip_append_page
4072      1.8736  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   __slab_free
3149      1.4489  raid0.ko                 raid0.ko                 raid0_make_request
3079      1.4167  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   add_to_page_cache_locked
3018      1.3887  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   sch_direct_xmit
2978      1.3703  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   __make_request
2798      1.2874  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   radix_tree_delete
2793      1.2851  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   __rmqueue
2757      1.2686  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   radix_tree_insert
2739      1.2603  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   __remove_mapping
2609      1.2005  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   generic_make_request
2578      1.1862  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   block_read_full_page
2556      1.1761  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   kmem_cache_free
2475      1.1388  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   find_get_pages_contig
2388      1.0988  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   shrink_page_list
2386      1.0979  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   isolate_pages_global
2314      1.0647  vmlinux-2.6.32-5-amd64   vmlinux-2.6.32-5-amd64   submit_bh

# perf record -o 8900.data -g -- ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 8900 -m send-sendfile 10.99.0.1 1234
# perf record -o 9000.data -g -- ./sendfileudp_test -f /dev/md0 -l 10737418240 -s 9000 -m send-sendfile 10.99.0.1 1234
# perf report -i 8900.data|grep -v "^#"|head
     6.30%  sendfileudp_tes  [kernel]                   [k] get_page_from_freelist
                |
                |--99.98%-- sendfile64
                 --0.02%-- [...]

     6.19%  sendfileudp_tes  [kernel]                   [k] __make_request

     5.34%  sendfileudp_tes  [kernel]                   [k] kmem_cache_alloc

     4.31%  sendfileudp_tes  [kernel]                   [k] add_to_page_cache_locked
# perf report -i 9000.data|grep -v "^#"|head
    24.33%  sendfileudp_tes  [kernel]                   [k] csum_partial

     4.38%  sendfileudp_tes  [kernel]                   [k] get_page_from_freelist

     4.17%  sendfileudp_tes  [kernel]                   [k] kmem_cache_alloc

     3.96%  sendfileudp_tes  [kernel]                   [k] _spin_lock
                |
                 --100.00%-- sendfile64


sudo stap -e 'probe kernel.function("csum_partial") { if (execname() == "sendfileudp_tes") { printf("%s\n", $$parms); print_backtrace(); } } probe begin { printf("R\n"); }'
sudo stap -e 'global sum; probe kernel.function("csum_partial") { if (execname() == "sendfileudp_tes") { sum += $len; printf("%s %d\n", $$parms, sum);} } probe begin { printf("ready\n"); }'

./sendfileudp_test -f /dev/md0 -l 10737418240 -m send-sendfile -r 10000 10.99.0.1 1234 -s 60000

buff=0xffff88002dede308 len=0xcf8 sum=0x0 3320
buff=0xffff88002dedf000 len=0x1000 sum=0x0 7416
buff=0xffff880030835000 len=0x618 sum=0x0 8976
buff=0xffff880030835618 len=0x9e8 sum=0x0 11512
buff=0xffff880030834000 len=0x1000 sum=0x0 15608
buff=0xffff880026671000 len=0x928 sum=0x0 17952
buff=0xffff880026671928 len=0x6d8 sum=0x0 19704
buff=0xffff880026670000 len=0x1000 sum=0x0 23800
buff=0xffff8800b7f9d000 len=0xc38 sum=0x0 26928
buff=0xffff8800b7f9dc38 len=0x3c8 sum=0x0 27896
buff=0xffff8800212cf000 len=0x1000 sum=0x0 31992
buff=0xffff8800212ce000 len=0xf48 sum=0x0 35904
buff=0xffff8800212cef48 len=0xb8 sum=0x0 36088
buff=0xffff8800306dc000 len=0x1000 sum=0x0 40184
buff=0xffff8800306dd000 len=0x1000 sum=0x0 44280
buff=0xffff8800407d4000 len=0x258 sum=0x0 44880
buff=0xffff8800407d4258 len=0xda8 sum=0x0 48376
buff=0xffff88002f98b000 len=0xa60 sum=0x0 51032


./sendfileudp_test -f /dev/md0 -l 10737418240 -m send-sendfile -r 10000 10.99.0.1 1234 -s 60000 -c

buff=0xffff88002dede308 len=0xcf8 sum=0x0 3320
buff=0xffff88002dedf000 len=0x1000 sum=0x0 7416
buff=0xffff880030835000 len=0x618 sum=0x0 8976
buff=0xffff880030835618 len=0x9e8 sum=0x0 11512
buff=0xffff880030834000 len=0x1000 sum=0x0 15608
buff=0xffff880026671000 len=0x928 sum=0x0 17952
buff=0xffff880026671928 len=0x6d8 sum=0x0 19704
buff=0xffff880026670000 len=0x1000 sum=0x0 23800
buff=0xffff8800b7f9d000 len=0xc38 sum=0x0 26928
buff=0xffff8800b7f9dc38 len=0x3c8 sum=0x0 27896
buff=0xffff8800212cf000 len=0x1000 sum=0x0 31992
buff=0xffff8800212ce000 len=0xf48 sum=0x0 35904
buff=0xffff8800212cef48 len=0xb8 sum=0x0 36088
buff=0xffff8800306dc000 len=0x1000 sum=0x0 40184
buff=0xffff8800306dd000 len=0x1000 sum=0x0 44280
buff=0xffff8800407d4000 len=0x258 sum=0x0 44880
buff=0xffff8800407d4258 len=0xda8 sum=0x0 48376          # wtf?
buff=0xffff88002f98b000 len=0xa60 sum=0x0 51032          #
buff=0xffff880135556424 len=0x8 sum=0x0 51040
buff=0xffff880101abb000 len=0x1000 sum=0x0 55136
buff=0xffff880101abd000 len=0x1000 sum=0x0 59232
buff=0xffff88002dede000 len=0x308 sum=0x0 60008

./sendfileudp_test -f /dev/md0 -l 10737418240 -m send-sendfile -r 10000 10.99.0.1 1234 -s 9000

buff=0xffff8800407d42d0 len=0x20 sum=0x0 32

./sendfileudp_test -f /dev/md0 -l 10737418240 -m send-sendfile -r 10000 10.99.0.1 1234 -s 9000 -c

buff=0xffff880030835630 len=0x20 sum=0x0 32
buff=0xffff88005ce77024 len=0x8 sum=0x0 40
buff=0xffff88002dede328 len=0xcd8 sum=0x0 3328
buff=0xffff88002dedf000 len=0x1000 sum=0x0 7424
buff=0xffff880030835000 len=0x630 sum=0x0 9008

./sendfileudp_test -f /dev/md0 -l 10737418240 -m send-sendfile -r 10000 10.99.0.1 1234 -s 9001 -c

buff=0xffff88002dede308 len=0x21 sum=0x0 33
buff=0xffff88011555e424 len=0x8 sum=0x0 41
buff=0xffff880101abb000 len=0x1000 sum=0x0 4137
buff=0xffff880101abd000 len=0x1000 sum=0x0 8233
buff=0xffff88002dede000 len=0x308 sum=0x0 9009

probe kernel.statement("ip_append_page@net/ipv4/ip_output.c:1155") {
    $skb->ip_summed = 3;
}
Forcing skb->ip_summed to 3 (CHECKSUM_PARTIAL) with the probe above avoids software checksumming with 64000-byte packets.

31.04%  sendfileudp_tes  [kernel]                   [k] csum_partial

 2.84%  sendfileudp_tes  [kernel]                   [k] get_page_from_freelist

 2.69%  sendfileudp_tes  [kernel]                   [k] _spin_lock
            |
                       |
                       |--95.76%-- sendfile64
                       |
                        --4.24%-- close

 2.12%  sendfileudp_tes  [kernel]                   [k] ip_append_page

 1.82%  sendfileudp_tes  [kernel]                   [k] kmem_cache_alloc
            |

vs.

4.60%  sendfileudp_tes  [kernel]                   [k] _spin_lock
           |
                      |
                      |--96.59%-- sendfile64
                      |
                       --3.41%-- close

3.92%  sendfileudp_tes  [kernel]                   [k] get_page_from_freelist

3.21%  sendfileudp_tes  [kernel]                   [k] ua_init     [iscsi_trgt]

3.08%  sendfileudp_tes  [kernel]                   [k] ip_append_page

2.39%  sendfileudp_tes  [kernel]                   [k] put_page



zotac:~/tmp/sendfileudp$ ./sendfileudp_test_execute.sh
version a4099bdfc7f3298dda2ea10be38a79fae0e3d211 
source zotac mtu 9000 dev eth3 kernel 2.6.32-5-amd64
Offload parameters for eth3:
Cannot get device GRO settings: Operation not permitted
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: off
large-receive-offload: off
ntuple-filters: off
receive-hashing: off
cmd_base ./sendfileudp_test -f/dev/md0 -l10737418240 10.99.0.1 1234
# (stap)-s60000 -msend-sendfile
[sudo] password for lindi: 
real 16.91 user 0.00 sys 4.52 cpu 26% fault 0/173
real 16.90 user 0.01 sys 4.45 cpu 26% fault 0/171
real 15.86 user 0.00 sys 4.44 cpu 28% fault 0/171
# -s60000 -msend-sendfile -c
real 15.77 user 0.00 sys 6.40 cpu 40% fault 0/172
real 16.48 user 0.00 sys 6.38 cpu 38% fault 0/171
real 16.21 user 0.01 sys 6.41 cpu 39% fault 0/171
# -s60000 -msend-sendfile
real 16.27 user 0.01 sys 6.00 cpu 36% fault 0/172
real 16.04 user 0.00 sys 6.03 cpu 37% fault 0/172
real 16.27 user 0.00 sys 6.07 cpu 37% fault 0/173
# (stap)-s60000 -msend-sendfile -q0
real 15.98 user 0.00 sys 4.50 cpu 28% fault 0/172
real 16.08 user 0.01 sys 4.47 cpu 27% fault 0/171
real 16.15 user 0.00 sys 4.50 cpu 27% fault 0/171
# -s60000 -msend-sendfile -q0 -c
real 16.08 user 0.01 sys 6.53 cpu 40% fault 0/172
real 15.85 user 0.01 sys 6.44 cpu 40% fault 0/173
real 16.31 user 0.02 sys 6.48 cpu 39% fault 0/171
# -s60000 -msend-sendfile -q0
real 16.12 user 0.01 sys 6.06 cpu 37% fault 0/172
real 15.72 user 0.00 sys 6.12 cpu 39% fault 0/171
real 16.27 user 0.01 sys 6.10 cpu 37% fault 0/173
# (stap)-s60000 -mread-send
real 17.52 user 0.02 sys 7.94 cpu 45% fault 0/189
real 17.37 user 0.01 sys 7.92 cpu 45% fault 0/188
real 17.22 user 0.01 sys 8.01 cpu 46% fault 0/188
# -s60000 -mread-send -c
real 17.15 user 0.02 sys 7.99 cpu 46% fault 0/189
real 17.23 user 0.01 sys 7.99 cpu 46% fault 0/188
real 17.28 user 0.01 sys 8.01 cpu 46% fault 0/188
# -s60000 -mread-send
real 17.06 user 0.03 sys 7.97 cpu 46% fault 0/187
real 16.85 user 0.01 sys 7.98 cpu 47% fault 0/188
real 16.81 user 0.00 sys 8.04 cpu 47% fault 0/188
# (stap)-s60000 -mread-send -q0
real 17.86 user 0.01 sys 7.96 cpu 44% fault 0/188
real 17.28 user 0.01 sys 7.97 cpu 46% fault 0/187
real 17.35 user 0.01 sys 7.97 cpu 46% fault 0/188
# -s60000 -mread-send -q0 -c
real 17.02 user 0.01 sys 7.99 cpu 47% fault 0/188
real 16.98 user 0.02 sys 7.98 cpu 47% fault 0/187
real 17.23 user 0.01 sys 8.01 cpu 46% fault 0/187
# -s60000 -mread-send -q0
real 17.11 user 0.02 sys 7.96 cpu 46% fault 0/190
real 17.71 user 0.01 sys 8.00 cpu 45% fault 0/188
real 16.59 user 0.01 sys 8.05 cpu 48% fault 0/188
# (stap)-s61440 -msend-sendfile
real 16.66 user 0.01 sys 4.38 cpu 26% fault 0/171
real 16.38 user 0.00 sys 4.39 cpu 26% fault 0/172
real 16.14 user 0.00 sys 4.36 cpu 27% fault 0/172
# -s61440 -msend-sendfile -c
real 16.42 user 0.01 sys 6.41 cpu 39% fault 0/173
real 16.33 user 0.01 sys 6.35 cpu 38% fault 0/171
real 16.61 user 0.00 sys 6.34 cpu 38% fault 0/171
# -s61440 -msend-sendfile
real 15.92 user 0.00 sys 6.00 cpu 37% fault 0/172
real 15.89 user 0.01 sys 5.94 cpu 37% fault 0/173
real 16.11 user 0.01 sys 6.02 cpu 37% fault 0/172
# (stap)-s61440 -msend-sendfile -q0
real 16.28 user 0.00 sys 4.39 cpu 27% fault 0/173
real 16.16 user 0.01 sys 4.44 cpu 27% fault 0/171
real 16.13 user 0.00 sys 4.49 cpu 27% fault 0/172
# -s61440 -msend-sendfile -q0 -c
real 16.44 user 0.00 sys 6.39 cpu 38% fault 0/172
real 15.97 user 0.00 sys 6.37 cpu 39% fault 0/172
real 15.90 user 0.00 sys 6.36 cpu 40% fault 0/172
# -s61440 -msend-sendfile -q0
real 16.40 user 0.00 sys 6.02 cpu 36% fault 0/172
real 15.75 user 0.00 sys 6.00 cpu 38% fault 0/172
real 16.03 user 0.00 sys 6.02 cpu 37% fault 0/172
# (stap)-s61440 -mread-send
real 17.15 user 0.00 sys 7.97 cpu 46% fault 0/188
real 17.95 user 0.02 sys 7.88 cpu 44% fault 0/188
real 17.55 user 0.02 sys 7.91 cpu 45% fault 0/188
# -s61440 -mread-send -c
real 17.43 user 0.01 sys 7.92 cpu 45% fault 0/187
real 17.42 user 0.03 sys 7.91 cpu 45% fault 0/188
real 17.39 user 0.01 sys 7.96 cpu 45% fault 0/187
# -s61440 -mread-send
real 17.17 user 0.01 sys 7.92 cpu 46% fault 0/187
real 17.31 user 0.01 sys 7.94 cpu 45% fault 0/188
real 16.99 user 0.00 sys 7.98 cpu 47% fault 0/188
# (stap)-s61440 -mread-send -q0
real 17.47 user 0.02 sys 8.23 cpu 47% fault 0/189
real 16.95 user 0.01 sys 8.26 cpu 48% fault 0/188
real 17.26 user 0.02 sys 8.25 cpu 47% fault 0/189
# -s61440 -mread-send -q0 -c
real 17.46 user 0.02 sys 8.28 cpu 47% fault 0/189
real 17.38 user 0.01 sys 8.26 cpu 47% fault 0/189
real 16.96 user 0.01 sys 8.28 cpu 48% fault 0/189
# -s61440 -mread-send -q0
real 17.26 user 0.02 sys 8.26 cpu 47% fault 0/189
real 16.54 user 0.02 sys 8.27 cpu 50% fault 0/189
real 17.27 user 0.00 sys 8.23 cpu 47% fault 0/189
# (stap)-s8900 -msend-sendfile
real 16.23 user 0.03 sys 4.71 cpu 29% fault 0/171
real 16.14 user 0.04 sys 4.71 cpu 29% fault 0/172
real 16.20 user 0.04 sys 4.71 cpu 29% fault 0/172
# -s8900 -msend-sendfile -c
real 15.91 user 0.03 sys 4.78 cpu 30% fault 0/172
real 16.10 user 0.06 sys 4.69 cpu 29% fault 0/172
real 16.20 user 0.04 sys 4.72 cpu 29% fault 0/172
# -s8900 -msend-sendfile
real 16.21 user 0.02 sys 4.72 cpu 29% fault 0/173
real 16.13 user 0.01 sys 4.70 cpu 29% fault 0/172
real 15.51 user 0.03 sys 4.72 cpu 30% fault 0/172
# (stap)-s8900 -msend-sendfile -q0
real 16.37 user 0.06 sys 4.88 cpu 30% fault 0/172
real 16.03 user 0.05 sys 4.88 cpu 30% fault 0/172
real 16.05 user 0.06 sys 4.90 cpu 30% fault 0/173
# -s8900 -msend-sendfile -q0 -c
real 15.90 user 0.05 sys 4.94 cpu 31% fault 0/171
real 15.79 user 0.04 sys 4.90 cpu 31% fault 0/172
real 15.87 user 0.07 sys 4.90 cpu 31% fault 0/172
# -s8900 -msend-sendfile -q0
real 15.88 user 0.06 sys 4.95 cpu 31% fault 0/171
real 16.34 user 0.06 sys 4.86 cpu 30% fault 0/173
real 16.01 user 0.06 sys 4.88 cpu 30% fault 0/173
# (stap)-s8900 -mread-send
real 17.24 user 0.06 sys 8.18 cpu 47% fault 0/176
real 17.02 user 0.06 sys 8.21 cpu 48% fault 0/175
real 17.10 user 0.06 sys 8.22 cpu 48% fault 0/175
# -s8900 -mread-send -c
real 17.11 user 0.04 sys 8.26 cpu 48% fault 0/175
real 17.14 user 0.06 sys 8.18 cpu 48% fault 0/176
real 17.53 user 0.07 sys 8.21 cpu 47% fault 0/177
# -s8900 -mread-send
real 17.69 user 0.05 sys 8.23 cpu 46% fault 0/176
real 17.39 user 0.10 sys 8.21 cpu 47% fault 0/176
real 17.43 user 0.04 sys 8.26 cpu 47% fault 0/177
# (stap)-s8900 -mread-send -q0
real 16.97 user 0.04 sys 8.15 cpu 48% fault 0/177
real 17.43 user 0.04 sys 8.23 cpu 47% fault 0/175
real 16.74 user 0.04 sys 8.21 cpu 49% fault 0/175
# -s8900 -mread-send -q0 -c
real 16.92 user 0.04 sys 8.22 cpu 48% fault 0/177
real 17.28 user 0.07 sys 8.22 cpu 47% fault 0/175
real 17.36 user 0.05 sys 8.22 cpu 47% fault 0/176
# -s8900 -mread-send -q0
real 17.32 user 0.08 sys 8.18 cpu 47% fault 0/177
real 17.11 user 0.05 sys 8.22 cpu 48% fault 0/176
real 16.75 user 0.03 sys 8.27 cpu 49% fault 0/177
