RaUI/Source/MyDb/FileFuns/TxtFileEncoder.cs
zilinsoft 3262955f2f ### 2023-11-07更新
------
#### RaUIV4    V4.0.2311.0701
- *.[全新]整合了MyDb、ryControls、MyDb_MySQL等dll文件到RaUI一个项目。
- *.[新增]新增ApkOp类,可以轻松获取APK信息。
- *.[新增]新增JsonExt扩展类,让Json操作更简单。
- *.[新增]新增WebP类,可以支持webp格式的图片。
- *.[改进]ryQuickSQL中的AddField方法改为自动替换已存在的同名值。
- *.[修复]ryQuickSQL中的AddFieldCalc方法无法正常计算的BUG。
2023-11-07 16:37:53 +08:00

273 lines
9.4 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace ryCommon
{
/// <summary>
/// Detects the text encoding of a file or stream by looking for a byte order mark (BOM)
/// and, when none is present, by checking whether the bytes form structurally valid UTF-8.
/// </summary>
public class TxtFileEncoder
{
    // Lead-byte masks and the value each mask must produce for a UTF-8 sequence of the
    // given length. A byte b starts an N-byte sequence when (b & MaskN) == ValidN.
    private const byte UTF8CharacterMask1Byte = 0b1000_0000;
    private const byte Valid1Byte = 0b0000_0000; // 0b0xxx_xxxx
    private const byte UTF8CharacterMask2Byte = 0b1110_0000;
    private const byte Valid2Byte = 0b1100_0000; // 0b110x_xxxx
    private const byte UTF8CharacterMask3Byte = 0b1111_0000;
    private const byte Valid3Byte = 0b1110_0000; // 0b1110_xxxx
    private const byte UTF8CharacterMask4Byte = 0b1111_1000;
    private const byte Valid4Byte = 0b1111_0000; // 0b1111_0xxx
    private const byte UTF8CharacterMaskForExtraByte = 0b1100_0000;
    private const byte ValidExtraByte = 0b1000_0000; // 0b10xx_xxxx

    /// <summary>
    /// Creates an instance. All functionality is exposed through static methods, so an
    /// instance carries no state; the constructor is kept for existing callers.
    /// </summary>
    public TxtFileEncoder()
    {
    }

    /// <summary>
    /// Detects the encoding of a text file, falling back to <see cref="Encoding.Default"/>
    /// when nothing can be detected or the file cannot be read.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(string fileName)
    {
        return GetEncoding(fileName, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a file stream, falling back to <see cref="Encoding.Default"/>.
    /// </summary>
    /// <param name="stream">File stream to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(FileStream stream)
    {
        return GetEncoding(stream, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="defaultEncoding">
    /// Encoding returned when no encoding can be detected, or when the file cannot be
    /// opened or read.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
    {
        try
        {
            using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
            {
                return GetEncoding(fs, defaultEncoding);
            }
        }
        catch (Exception)
        {
            // Detection is deliberately best-effort: any failure (missing file, access
            // denied, I/O error, invalid path) yields the caller-supplied default.
            return defaultEncoding;
        }
    }

    /// <summary>
    /// Detects the encoding of a file stream by reading it from its current position to the end.
    /// </summary>
    /// <param name="stream">
    /// File stream to inspect. When it is non-null and at least two bytes long, it is
    /// closed by this method after its content has been read.
    /// </param>
    /// <param name="defaultEncoding">
    /// Encoding returned when the stream is null, shorter than two bytes, or neither
    /// carries a BOM nor consists of well-formed UTF-8.
    /// </param>
    /// <returns>
    /// <see cref="Encoding.BigEndianUnicode"/>, <see cref="Encoding.Unicode"/> or
    /// <see cref="Encoding.UTF8"/> when the matching BOM is found; a BOM-less
    /// <see cref="UTF8Encoding"/> when the content is well-formed UTF-8; otherwise
    /// <paramref name="defaultEncoding"/>.
    /// </returns>
    public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
    {
        if (stream == null || stream.Length < 2)
        {
            return defaultEncoding;
        }

        try
        {
            byte[] content = ReadRemainingBytes(stream);

            Encoding fromBom = DetectBom(content);
            if (fromBom != null)
            {
                return fromBom;
            }

            // No BOM: treat structurally valid UTF-8 as UTF-8 without a preamble.
            if (IsUTF8Bytes(content))
            {
                return new UTF8Encoding(false);
            }

            return defaultEncoding;
        }
        finally
        {
            // Existing behaviour: the inspected stream is closed, also when reading fails.
            stream.Close();
        }
    }

    /// <summary>
    /// Detects the encoding of an arbitrary stream by reading it from its current
    /// position to the end. Handles UTF-8 without a BOM.
    /// </summary>
    /// <param name="fs">Stream to inspect. It is closed by this method.</param>
    /// <returns>
    /// <see cref="Encoding.BigEndianUnicode"/>, <see cref="Encoding.Unicode"/> or
    /// <see cref="Encoding.UTF8"/> when a BOM is found or the content is well-formed
    /// UTF-8; <see cref="Encoding.Default"/> otherwise, including for an empty stream.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    public static System.Text.Encoding GetEncoding(Stream fs)
    {
        if (fs == null)
        {
            throw new ArgumentNullException(nameof(fs));
        }

        try
        {
            byte[] content = ReadRemainingBytes(fs);
            if (content.Length == 0)
            {
                // Nothing to base a decision on.
                return Encoding.Default;
            }

            Encoding fromBom = DetectBom(content);
            if (fromBom != null)
            {
                return fromBom;
            }

            // Validate the whole content, not just the tail: starting the check in the
            // middle of a multi-byte character would wrongly reject valid UTF-8.
            return GetBytesEncoding(content) == Encoding.UTF8 ? Encoding.UTF8 : Encoding.Default;
        }
        finally
        {
            // Existing behaviour: the inspected stream is closed.
            fs.Close();
        }
    }

    /// <summary>
    /// Reads every byte from the current position of <paramref name="source"/> to its end.
    /// Does not require the stream to be seekable.
    /// </summary>
    private static byte[] ReadRemainingBytes(Stream source)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            source.CopyTo(buffer);
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Returns the encoding announced by a leading BOM, or null when there is none.
    /// Recognises UTF-16 BE (FE FF), UTF-16 LE (FF FE) and UTF-8 (EF BB BF).
    /// </summary>
    private static Encoding DetectBom(byte[] data)
    {
        if (HasPrefix(data, 0xFE, 0xFF))
        {
            return Encoding.BigEndianUnicode;
        }

        // The FF FE FF exclusion is retained from the original detection logic.
        if (HasPrefix(data, 0xFF, 0xFE) && (data.Length < 3 || data[2] != 0xFF))
        {
            return Encoding.Unicode;
        }

        if (HasPrefix(data, 0xEF, 0xBB, 0xBF))
        {
            return Encoding.UTF8;
        }

        return null;
    }

    /// <summary>
    /// Tells whether <paramref name="data"/> begins with the given bytes.
    /// </summary>
    private static bool HasPrefix(byte[] data, params byte[] prefix)
    {
        if (data.Length < prefix.Length)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Classifies a byte array as UTF-8 or not. Returns <see cref="Encoding.UTF8"/> when
    /// the array starts with the UTF-8 BOM or when every lead byte is followed by the
    /// number of 10xxxxxx continuation bytes it announces (sequences of up to six bytes
    /// are accepted); returns <see cref="Encoding.Default"/> otherwise.
    /// </summary>
    private static Encoding GetBytesEncoding(byte[] bs)
    {
        int len = bs.Length;
        if (len >= 3 && bs[0] == 0xEF && bs[1] == 0xBB && bs[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        // For j in 0..5: a lead byte b announces j continuation bytes when
        // (b >> cs[j]) == cs[j + 6], i.e. its top bits are 0, 110, 1110, 11110, ...
        int[] cs = { 7, 5, 4, 3, 2, 1, 0, 6, 14, 30, 62, 126 };
        for (int i = 0; i < len; i++)
        {
            int bits = -1;
            for (int j = 0; j < 6; j++)
            {
                if (bs[i] >> cs[j] == cs[j + 6])
                {
                    bits = j;
                    break;
                }
            }

            if (bits == -1)
            {
                // Not a valid lead byte (e.g. a stray continuation byte).
                return Encoding.Default;
            }

            while (bits-- > 0)
            {
                i++;
                if (i == len || bs[i] >> 6 != 2)
                {
                    // Sequence is truncated or the byte is not a continuation byte.
                    return Encoding.Default;
                }
            }
        }

        return Encoding.UTF8;
    }

    /// <summary>
    /// Checks whether a byte array is structurally valid UTF-8 (without relying on a BOM):
    /// every lead byte of a 1- to 4-byte sequence must be followed by the right number of
    /// 10xxxxxx continuation bytes, and the data must not end in the middle of a sequence.
    /// Overlong forms and surrogate code points are not rejected.
    /// </summary>
    /// <param name="bytes">Bytes to check.</param>
    /// <returns>
    /// true when the bytes are structurally valid UTF-8 (an empty array qualifies);
    /// false otherwise, including when <paramref name="bytes"/> is null.
    /// </returns>
    public static bool IsUTF8Bytes(byte[] bytes)
    {
        if (bytes == null)
        {
            return false;
        }

        int pendingContinuationBytes = 0;
        foreach (byte bt in bytes)
        {
            if (pendingContinuationBytes > 0)
            {
                if ((bt & UTF8CharacterMaskForExtraByte) != ValidExtraByte)
                {
                    return false;
                }

                pendingContinuationBytes--;
                continue;
            }

            if ((bt & UTF8CharacterMask1Byte) == Valid1Byte)
            {
                continue;
            }

            if ((bt & UTF8CharacterMask2Byte) == Valid2Byte)
            {
                pendingContinuationBytes = 1;
            }
            else if ((bt & UTF8CharacterMask3Byte) == Valid3Byte)
            {
                pendingContinuationBytes = 2;
            }
            else if ((bt & UTF8CharacterMask4Byte) == Valid4Byte)
            {
                pendingContinuationBytes = 3;
            }
            else
            {
                // Stray continuation byte or an invalid lead byte.
                return false;
            }
        }

        // A sequence still waiting for continuation bytes means the data was truncated.
        return pendingContinuationBytes == 0;
    }
}
}