一、编码范围
1.GBK(GB2312/GB18030)
\x00-\xff GBK双字节编码范围
\x20-\x7f ASCII
\xa1-\xff 中文
\x80-\xff 中文
2.UTF-8(Unicode)
\u4e00-\u9fa5 (中文)
\x3130-\x318F (韩文)
\xAC00-\xD7A3 (韩文)
\u0800-\u4e00 (日文)
ps: 韩文是大于 [\u9fa5] 的字符
正则例子:
PLAINTEXT
PHP:
// GBK: remove every high byte (0x80-0xFF), i.e. the bytes that make up
// double-byte characters. Note that preg_replace() returns the new string.
preg_replace('/([\x80-\xff])/', '', $str);
// UTF-8: remove CJK ideographs U+4E00..U+9FA5. PCRE has no \u escape, so the
// code points are written as \x{...} and the /u modifier enables UTF-8 mode.
preg_replace('/([\x{4e00}-\x{9fa5}])/u', '', $str);
二、代码例子
PLAINTEXT
PHP:
// Check whether the content contains Chinese text - GBK (PHP).
//
// @param string $s  GBK-encoded byte string.
// @return int|false 1 if a high byte (0x80-0xFF) followed by one more byte is
//                   found, 0 if not; false only if preg_match() itself fails.
function check_is_chinese($s) {
    // Single quotes keep the \x escapes intact for PCRE, which reads them as
    // raw byte values (no /u modifier, so the subject is treated as bytes).
    return preg_match('/[\x80-\xff]./', $s);
}
// Get the length of a string in characters - GBK (PHP).
//
// A byte >= 0x80 is treated as the lead byte of a double-byte character and
// consumes the following byte as well; every other byte is one character.
//
// @param string $str GBK-encoded byte string.
// @return int        Number of characters.
function gb_strlen($str) {
    $count = 0;
    $bytes = strlen($str); // loop-invariant, so computed once
    for ($i = 0; $i < $bytes; $i++) {
        if (ord($str[$i]) >= 0x80) {
            ++$i; // skip the trail byte of a double-byte character
        }
        ++$count;
    }
    return $count;
}
// Take the first $len characters of a string - GBK (PHP).
//
// A byte >= 0x80 is treated as the lead byte of a double-byte character, so
// the cut never lands between the two bytes of one character.
//
// @param string $str GBK-encoded byte string.
// @param int    $len Number of characters to keep.
// @return string     The leading $len characters (the whole string if shorter).
function gb_substr($str, $len) {
    $count = 0;
    $bytes = strlen($str); // loop-invariant, so computed once
    for ($i = 0; $i < $bytes; $i++) {
        if ($count == $len) {
            break;
        }
        if (ord($str[$i]) >= 0x80) {
            ++$i; // keep the trail byte together with its lead byte
        }
        ++$count;
    }
    // $i is now the byte offset just past the last character kept.
    return substr($str, 0, $i);
}
// Measure the length of a string - UTF-8 (PHP).
//
// The count is width-style rather than a plain character count: each ASCII
// byte adds 1, and each multi-byte character (2, 3 or 4 bytes) adds 2.
//
// @param string $str UTF-8 encoded byte string.
// @return int        Accumulated count as described above.
// Terminates the script via die() when a byte above 127 is not a valid
// 2-, 3- or 4-byte lead byte (e.g. a stray continuation byte).
function utf8_strlen($str) {
    $count = 0;
    $bytes = strlen($str); // loop-invariant, so computed once
    for ($i = 0; $i < $bytes; $i++) {
        $value = ord($str[$i]);
        if ($value > 127) {
            // Multi-byte character: one extra unit here, one more below.
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('Not a UTF-8 compatible string');
            }
        }
        $count++;
    }
    return $count;
}
// Extract part of a string - UTF-8 (PHP).
//
// $position and $length are measured in the same width-style units as the
// loop counter: each ASCII byte is 1 unit, each multi-byte character is 2.
// The cut always falls on a character boundary.
//
// @param string $str      UTF-8 encoded byte string.
// @param int    $position Offset (in units) where the result starts.
// @param int    $length   Size (in units) of the result.
// @return string          The selected slice; empty when $position lies
//                         beyond the end of the string.
// Terminates the script via die() when a byte above 127 is not a valid
// 2-, 3- or 4-byte lead byte.
function utf8_substr($str, $position, $length) {
    $bytes = strlen($str); // loop-invariant, so computed once
    $start_position = $bytes;
    $start_byte = 0;
    $end_position = $bytes;
    $started = false;
    $count = 0;
    for ($i = 0; $i < $bytes; $i++) {
        if (!$started && $count >= $position) {
            $start_position = $i;
            $start_byte = $count;
            $started = true;
        }
        // Only measure the length once the start has been found; otherwise
        // the loop would stop after $length units counted from offset 0 and
        // return nothing whenever $position >= $length.
        if ($started && ($count - $start_byte) >= $length) {
            $end_position = $i;
            break;
        }
        $value = ord($str[$i]);
        if ($value > 127) {
            // Multi-byte character: one extra unit here, one more below.
            $count++;
            // Skip the continuation bytes that belong to this lead byte.
            if ($value >= 192 && $value <= 223) {
                $i++;
            } elseif ($value >= 224 && $value <= 239) {
                $i = $i + 2;
            } elseif ($value >= 240 && $value <= 247) {
                $i = $i + 3;
            } else {
                die('Not a UTF-8 compatible string');
            }
        }
        $count++;
    }
    return substr($str, $start_position, $end_position - $start_position);
}
# Width-style length of a URL-encoded UTF-8 string (Ruby).
#
# The argument is first passed through CGI::unescape. Each ASCII character
# then adds 1 to the result and every non-ASCII character (Chinese, Russian,
# Korean, ...) adds 2, regardless of how many bytes it occupies in UTF-8.
#
# str     - URL-encoded String.
# Returns - Integer length as described above.
require 'cgi'

def utf8_string_length(str)
  temp = CGI::unescape(str)
  width = 0
  # Iterate per character rather than per byte so that the result does not
  # depend on which byte sequences happen to follow each other.
  temp.each_char do |ch|
    width += ch.ascii_only? ? 1 : 2
  end
  width
end
// Check whether a string contains Korean characters (JavaScript).
//
// A character counts as Korean when its UTF-16 code unit lies strictly
// between 0x3130 and 0x318F, or within 0xAC00..0xD7A3 inclusive.
//
// @param {string} str  Text to inspect.
// @returns {boolean}   true if at least one such character is present.
function checkKoreaChar(str) {
    // Declare the index locally; the bare `i` leaked into the global scope.
    for (var i = 0; i < str.length; i++) {
        var code = str.charCodeAt(i);
        if ((code > 0x3130 && code < 0x318F) || (code >= 0xAC00 && code <= 0xD7A3)) {
            return true;
        }
    }
    return false;
}
// Check whether a string contains Chinese (double-byte) characters (JavaScript).
//
// Every character outside the range 0x00-0xFF is replaced by two characters;
// if that changes the length, at least one such character was present.
//
// @param {string} s   Text to inspect.
// @returns {boolean}  true if s contains a character with a code above 0xFF.
function check_chinese_char(s) {
    return (s.length != s.replace(/[^\x00-\xff]/g, "**").length);
}