Saturday, October 6, 2007

Binaries really faster than lists

Caoyuan did his second round of Wide Finder Project using lists, but expected lists traversing is faster than binary traversing. But he made little mistake. He assumed, that timer:tc(tbray, travel_list, [binary_to_list(Bin)]) including binary_to_list(Bin) time, but it is not true. binary_to_list(Bin) is done before timer:tc call and result passed as parameter. But there are more performance problems in his solution. He often use lists:reverse and ++ which twice reverse first parameter too. Bigger memory usage of list and often reversing must cause bad performance. When I tried write same algorithm using binary I take 3 times speed up.

-module(tbray2).

-compile([native, {hipe,[o3]}]).

-export([start/1, start/2]).

-record(context, {main,
                 dict,
                 chunkNum,
                 processedNum = 0}).

start(FileName) -> start(FileName, 1024*32).
start(FileName, ChunkSize) ->
   Start = now(),
   Main = self(),
   Collector = spawn_link(fun () -> collect_loop(#context{main = Main,
                                                     dict = dict:new(),
                                                     processedNum = 0}) end),
   ChunkNum = foreach_chunk(
           fun(Chunk) ->
               spawn_link(fun() -> Collector ! scan_lines(Chunk) end)
           end,
           FileName,
           ChunkSize
       ),
   Collector ! {chunkNum, ChunkNum},
  
   %% don't terminate, wait here, until all tasks done.
   receive
       stop -> io:format("Time: ~p ms~n", [timer:now_diff(now(), Start) / 1000])
   end.

foreach_chunk(Fun, FileName, SizeOfChunk) ->
   {ok, File} = file:open(FileName, [raw, binary]),
   N = foreach_chunk(Fun, File, <<>>, SizeOfChunk, 0),
   file:close(File),
   N.

foreach_chunk(Fun, File, PrevRest, SizeOfChunk, N) ->
   {Chunk, Rest} = read_chunk(File, PrevRest, SizeOfChunk),
   Fun(Chunk),
   case Rest of
       <<>> -> N+1;
       _ -> foreach_chunk(Fun, File, Rest, SizeOfChunk, N+1)
   end.

read_chunk(File, PrevRest, N) ->
   case file:read(File, N) of
       {ok, B} ->
           {Line, Rest} = split_on_nl(B),
           Chunk = <<PrevRest/binary, Line/binary>>,
           case Rest of
               <<>> ->
                   read_chunk(File, Chunk, N);
               _ ->
                   {Chunk, Rest}
           end;
       eof ->
           {PrevRest, <<>>}
   end.

split_on_nl(B) -> split_on_nl(B, 0, size(B)).

split_on_nl(B, N, S) when N < S -> case B of
       <<Line:N/binary, $\n, Tail/binary>> -> {Line, Tail};
       _ -> split_on_nl(B, N+1, S)
   end;
split_on_nl(B, _, _) -> {B, <<>>}.

collect_loop(#context{main=Main,
                     dict=Dict,
                     chunkNum=ChunkNum,
                     processedNum=ProcessedNum}=Context) ->
   case ProcessedNum of
       ChunkNum ->
           print_result(Dict),
           Main ! stop;
       _ ->
           receive
               {chunkNum, N} -> collect_loop(Context#context{chunkNum = N});
               DictX ->
                   collect_loop(Context#context{
                       dict = dict:merge(fun (_, V1, V2) -> V1 + V2 end, Dict, DictX),
                       processedNum = ProcessedNum+1})
           end
   end.

print_result(Dict) ->
   [R1, R2, R3, R4, R5, R6, R7, R8, R9, R10 | _] =
       lists:reverse(lists:keysort(2, dict:to_list(Dict))),
   lists:foreach(fun ({Word, Count}) ->
                         io:format("~p get requests for ~s~n", [Count, Word])
                 end, [R1, R2, R3, R4, R5, R6, R7, R8, R9, R10]).

scan_lines(Bin) -> scan_lines(Bin, dict:new()).
scan_lines(Bin, Dict) ->
   case naive_search(Bin) of
       {ok, Key, Rest} ->
           scan_lines(
               element(2, split_on_nl(Rest)),
               dict:update_counter(Key, 1, Dict));
       false -> Dict
   end.

%% naive_search(Binary()) -> false | {ok, Key, Rest}
naive_search(B) -> naive_search(B, 0, size(B)-18).

naive_search(B, N, S) when N < S ->
   case B of
       <<_:N/binary, "GET /ongoing/When/", Rest/binary>> ->
           case keyMatch(Rest) of
               Result = {ok, _Key, _Rest2} -> Result;
               false -> naive_search(Rest)
           end;
       _ -> naive_search(B, N+1, S)
   end;
naive_search(_, _, _) -> false.

%% keyMatch(Binary()) -> false | {ok, Key, Rest}
keyMatch(<<C, _/binary>>) when C == $ ; C == $. -> false;   % empty
keyMatch(B) -> keyMatch(B, 1, size(B)).

keyMatch(B, N, S) when N<S ->
   case B of
       % end with space
       <<Key:N/binary, $ , Rest/binary>> -> {ok, Key, Rest};
       <<_:N/binary, $., _/binary>> -> false;
       _ -> keyMatch(B, N+1, S)
   end;
keyMatch(_, _, _) -> false.
Result is less memory consuming and faster program. Problem is partitioned same way as Caoyuan did and I suppose that scale same way, but I can't test it because I don't have any multi core computer.

2 comments:

Caoyuan Deng said...

Hi, Pichi

I actually have another bin version, I didn't post it yet because I'll do more testing, then write sth. about List vs Bin in Erlang.

The binary version of mine and yours are almost same, and they all took about 49 seconds for same 200M file on my MacBook, so, were much slower than list version (7.4 seconds).

You are right for my timer:tc test for list, I also rewrote a travel_test and will post it too.

BR,

Pichi said...

Hi Deng,
I'm thinking why your lists version is faster than my measures. I don't have lists module compiled with HiPe! That's wrong. My default deb package installation don't made with HiPe and this cases my measure of your code is significantly slower then my and you measure opposite. Well, now I must find how to recompile all standard libraries in already installed erlang.

Pichi