6> string:substr("abcd我们就是喜欢Erlang,就是喜欢OTP",4,3). [100,25105,20204] 7> io:format("~ts",[v(6)]). d我们ok
Eshell V5.10.2 (abort with ^G) 1> u:sub(). [100,230,136] 2> io:format("~ts",[v(1)]). dæok 3> q(). ok 4>
%% coding: utf-8
Eshell V5.10.2 (abort with ^G) 1> u:sub(). [100,25105,20204] 2> io:format("~ts",[v(1)]). d我们ok
%% The default source-file encoding.
-define(DEFAULT_ENCODING, latin1).

%% @doc Returns the default source-file encoding (the value of
%% ?DEFAULT_ENCODING).
-spec default_encoding() -> source_encoding().
default_encoding() ->
    ?DEFAULT_ENCODING.

%% @doc Maps an encoding atom to the corresponding "coding: ..." string.
%% Only `utf8' and `latin1' are accepted; any other argument fails with
%% function_clause.
-spec encoding_to_string(Encoding) -> string() when
      Encoding :: source_encoding().
encoding_to_string(utf8) ->
    "coding: utf-8";
encoding_to_string(latin1) ->
    "coding: latin-1".
The Erlang source file encoding is selected by a comment in one of the first two lines of the source file. The first string that matches the regular expression coding\s*[:=]\s*([-a-zA-Z0-9])+ selects the encoding. If the matching string is not a valid encoding it is ignored. The valid encodings are Latin-1 and UTF-8 where the case of the characters can be chosen freely.
As of Erlang/OTP R16 Erlang source files can be written in either UTF-8 or bytewise encoding (a.k.a. latin1 encoding). The details on how to state the encoding of an Erlang source file can be found in epp(3). Strings and comments can be written using Unicode, but functions still have to be named using characters from the ISO-latin-1 character set and atoms are restricted to the same ISO-latin-1 range. These restrictions in the language are of course independent of the encoding of the source file. Erlang/OTP R18 is expected to handle functions named in Unicode as well as Unicode atoms. http://www.erlang.org/doc/apps/stdlib/unicode_usage.html
%% @doc Demo module: truncating strings after a number of characters or
%% words, for input given as a list of UTF-8 bytes, a binary, or a list of
%% Unicode code points.
-module(u).

%% Demo entry points.
-export([test/0, test2/0, dump/2, sub/0, t/2, tw/2]).
%% Truncation API.
-export([truncatechars/2, truncatewords/2]).
%% Accumulator-based helpers; kept exported because the module was
%% previously compiled with export_all and they were callable from outside.
-export([truncatechars/3, truncatewords/3]).

%% @doc Truncates a sample mixed ASCII/CJK string to 10 characters.
test() ->
    t("abcd我们就是喜欢Erlang,就是喜欢OTP", 10).

%% @doc Truncates a sample English sentence to 10 words.
test2() ->
    tw("Youth is not a time of life; it is a state of mind; it is not a matter of rosy cheeks, red lips and supple knees; it is a matter of the will, a quality of the imagination, a vigor of the emotions; it is the freshness of the deep springs of life.", 10).

%% @doc Writes Data, followed by a period and a newline, to FileName.
%% Returns whatever file:write_file/2 returns.
%% NOTE(review): "~s" is kept on purpose so that byte lists are written
%% unchanged; presumably Data is a list of bytes here - confirm before
%% passing code points above 255.
dump(FileName, Data) ->
    file:write_file(FileName, io_lib:fwrite("~s.\n", [Data])).

%% @doc Returns the 3 elements starting at position 4 of the sample string.
sub() ->
    string:substr("abcd我们就是喜欢Erlang,就是喜欢OTP", 4, 3).

%% @doc Shorthand for truncatechars/2.
t(Input, Max) ->
    truncatechars(Input, Max).

%% @doc Shorthand for truncatewords/2.
tw(Input, Max) ->
    truncatewords(Input, Max).

%% @doc Truncates a string after a certain number of characters.
%% Input may be a binary or a list; a binary is processed byte-wise and a
%% binary is returned. When Max is zero or negative the empty list is
%% returned regardless of the input type. If anything is cut off, "..." is
%% appended.
truncatechars(_Input, Max) when Max =< 0 ->
    "";
truncatechars(Input, Max) when is_binary(Input) ->
    list_to_binary(truncatechars(binary_to_list(Input), Max));
truncatechars(Input, Max) ->
    truncatechars(Input, Max, []).

%% @doc Truncates a string after a certain number of words.
%% Input may be a binary or a list; an atom is returned unchanged. When Max
%% is zero or negative the empty list is returned. If anything is cut off,
%% "..." is appended.
truncatewords(_Input, Max) when Max =< 0 ->
    "";
truncatewords(Input, Max) when is_binary(Input) ->
    list_to_binary(truncatewords(binary_to_list(Input), Max));
truncatewords(Input, Max) ->
    truncatewords(Input, Max, []).

%% @doc Worker for truncatechars/2. Acc holds the kept elements in reverse
%% order; CharsLeft is the number of characters that may still be kept.
truncatechars([], _CharsLeft, Acc) ->
    lists:reverse(Acc);
truncatechars(_Input, 0, Acc) ->
    %% Budget used up while input remains: mark the cut with an ellipsis.
    lists:reverse("..." ++ Acc);
truncatechars([C | Rest], CharsLeft, Acc) when is_integer(C), C > 16#FF ->
    %% An element above 255 cannot be a byte, so it is a whole Unicode code
    %% point (as produced by string literals in a UTF-8 encoded source file)
    %% and counts as exactly one character. Without this clause such values
    %% would be mistaken for a multi-byte lead byte and enlarge the budget.
    truncatechars(Rest, CharsLeft - 1, [C | Acc]);
%% Multi-byte lead bytes: each trailing byte of the sequence subtracts one
%% from CharsLeft in the last clause, so the lead byte adds back
%% (number of trailing bytes - 1) to make the whole sequence cost one.
truncatechars([C | Rest], CharsLeft, Acc) when C >= 2#11111100 ->
    truncatechars(Rest, CharsLeft + 4, [C | Acc]);
truncatechars([C | Rest], CharsLeft, Acc) when C >= 2#11111000 ->
    truncatechars(Rest, CharsLeft + 3, [C | Acc]);
truncatechars([C | Rest], CharsLeft, Acc) when C >= 2#11110000 ->
    truncatechars(Rest, CharsLeft + 2, [C | Acc]);
truncatechars([C | Rest], CharsLeft, Acc) when C >= 2#11100000 ->
    truncatechars(Rest, CharsLeft + 1, [C | Acc]);
truncatechars([C | Rest], CharsLeft, Acc) when C >= 2#11000000 ->
    truncatechars(Rest, CharsLeft, [C | Acc]);
truncatechars([C | Rest], CharsLeft, Acc) ->
    %% Single-byte character or trailing byte of a multi-byte sequence.
    truncatechars(Rest, CharsLeft - 1, [C | Acc]).

%% @doc Worker for truncatewords/2. Acc holds the kept elements in reverse
%% order; WordsLeft is the number of words that may still be completed.
truncatewords(Value, _WordsLeft, _Acc) when is_atom(Value) ->
    Value;
truncatewords([], _WordsLeft, Acc) ->
    lists:reverse(Acc);
truncatewords(_Input, 0, Acc) ->
    lists:reverse("..." ++ Acc);
truncatewords([C1, C2 | Rest], WordsLeft, Acc) when C1 =/= $\s andalso C2 =:= $\s ->
    %% A non-space element followed by a space ends a word.
    truncatewords([C2 | Rest], WordsLeft - 1, [C1 | Acc]);
truncatewords([C1 | Rest], WordsLeft, Acc) ->
    truncatewords(Rest, WordsLeft, [C1 | Acc]).
test() -> t("abcd我们就是喜欢Erlang,就是喜欢OTP",10). dump(FileName,Data)-> file:write_file(FileName, io_lib:fwrite("~s.\n", [Data])). Eshell V5.10.2 (abort with ^G) 1> u:test(). [97,98,99,100,230,136,145,228,187,172,229,176,177,230,152, 175,229,150,156,230,172,162,46,46,46] 2> 2> u:dump("u_result",v(1)). ok 3>
[root@nimbus demo]# cat u_result
abcd我们就是喜欢....
Unicode编码(16进制) | UTF-8 字节流模板 |
000000 - 00007F | 0xxxxxxx |
000080 - 0007FF | 110xxxxx 10xxxxxx |
000800 - 00FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
010000 - 10FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
Eshell V5.10.2 (abort with ^G) 1> unicode:characters_to_binary("开心"). <<229,188,128,229,191,131>> 2> unicode:characters_to_list("开心"). [24320,24515] 3> integer_to_list(24320,2). "101111100000000" 4> integer_to_list(24515,2). "101111111000011" 5> integer_to_list(23383,2). "101101101010111"