RaUI/Source/MyDb/FileFuns/TxtFileEncoder.cs
鑫Intel c3d4ddf574 ### 2021-07-29更新
------
#### MyDbV4   V3.0.2107.2901
- *.[新增]新增支持计算文件MD5。
- *.[新增]部分DataProvider功能移植到DbExtension里,增加扩展性。
- *.[新增]UnixTimeToDateTime和JSTimeToDateTime新增支持long参数。
- *.[合并]合并RyWeb项目到MyDb里。

#### ryControlsV4    V3.0.2107.2901
  -  *.[改进]优化减少大量IDE警告和消息。
2021-07-29 17:09:32 +08:00

256 lines
9.1 KiB
C#
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace ryCommon
{
/// <summary>
/// 用于取得一个文本文件的编码方式(Encoding)。
/// </summary>
public class TxtFileEncoder
{
/// <summary>
/// 用于取得一个文本文件的编码方式(Encoding)。
/// </summary>
public TxtFileEncoder()
{
//
// TODO: 在此处添加构造函数逻辑
//
}
/// <summary>
/// 取得一个文本文件的编码方式。如果无法在文件头部找到有效的前导符Encoding.Default将被返回。
/// </summary>
/// <param name="fileName">文件名。</param>
/// <returns></returns>
public static Encoding GetEncoding(string fileName)
{
return GetEncoding(fileName, Encoding.Default);
}
/// <summary>
/// 取得一个文本文件流的编码方式。
/// </summary>
/// <param name="stream">文本文件流。</param>
/// <returns></returns>
public static Encoding GetEncoding(FileStream stream)
{
return GetEncoding(stream, Encoding.Default);
}
/// <summary>
/// 取得一个文本文件的编码方式。
/// </summary>
/// <param name="fileName">文件名。</param>
/// <param name="defaultEncoding">默认编码方式。当该方法无法从文件的头部取得有效的前导符时,将返回该编码方式。</param>
/// <returns></returns>
public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
{
Encoding targetEncoding = defaultEncoding;
try
{
FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read);
try
{
targetEncoding = GetEncoding(fs, defaultEncoding);
}
catch { }
fs.Close();
}
catch
{
return defaultEncoding;
}
return targetEncoding;
}
/// <summary>
/// 取得一个文本文件流的编码方式。
/// </summary>
/// <param name="stream">文本文件流。</param>
/// <param name="defaultEncoding">默认编码方式。当该方法无法从文件的头部取得有效的前导符时,将返回该编码方式。</param>
/// <returns></returns>
public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
{
Encoding targetEncoding = defaultEncoding;
if (stream != null && stream.Length >= 2)
{
//保存文件流的前4个字节
byte byte1;
byte byte2;
byte byte3 = 0;
byte byte4;
//保存当前Seek位置
long origPos = stream.Seek(0, SeekOrigin.Begin);
stream.Seek(0, SeekOrigin.Begin);
int nByte = stream.ReadByte();
byte1 = Convert.ToByte(nByte);
byte2 = Convert.ToByte(stream.ReadByte());
if (stream.Length >= 3)
{
byte3 = Convert.ToByte(stream.ReadByte());
}
if (stream.Length >= 4)
{
byte4 = Convert.ToByte(stream.ReadByte());
}
//根据文件流的前4个字节判断Encoding
//Unicode {0xFF, 0xFE};
//BE-Unicode {0xFE, 0xFF};
//UTF8 = {0xEF, 0xBB, 0xBF};
if (byte1 == 0xFE && byte2 == 0xFF)//UnicodeBe
{
targetEncoding = Encoding.BigEndianUnicode;
}
else if (byte1 == 0xFF && byte2 == 0xFE && byte3 != 0xFF)//Unicode
{
targetEncoding = Encoding.Unicode;
}
else if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF)//UTF8
{
targetEncoding = Encoding.UTF8;
}
else
{
int index =0;
byte[] bytes = new byte[100];
stream.Seek(origPos, SeekOrigin.Begin);
while (stream.CanRead)
{
index++; if (index >= 100) { break; }
bytes[index - 1] = Convert.ToByte(stream.ReadByte());
}
if (GetBytesEncoding(bytes)==Encoding.UTF8) { targetEncoding = Encoding.UTF8; }
}
//恢复Seek位置
stream.Seek(origPos, SeekOrigin.Begin);
}
return targetEncoding;
}
// 新增加一个方法解决了不带BOM的 UTF8 编码问题
/// <summary>
/// 通过给定的文件流,判断文件的编码类型
/// </summary>
/// <param name="fs">文件流</param>
/// <returns>文件的编码类型</returns>
public static System.Text.Encoding GetEncoding(Stream fs)
{
byte[] Unicode = new byte[] { 0xFF, 0xFE, 0x41 };
byte[] UnicodeBIG = new byte[] { 0xFE, 0xFF, 0x00 };
byte[] UTF8 = new byte[] { 0xEF, 0xBB, 0xBF }; //带BOM
Encoding reVal = Encoding.Default;
BinaryReader r = new BinaryReader(fs, System.Text.Encoding.Default);
byte[] ss = r.ReadBytes(4);
if (ss[0] == 0xFE && ss[1] == 0xFF && ss[2] == 0x00)
{
reVal = Encoding.BigEndianUnicode;
}
else if (ss[0] == 0xFF && ss[1] == 0xFE && ss[2] == 0x41)
{
reVal = Encoding.Unicode;
}
else
{
if (ss[0] == 0xEF && ss[1] == 0xBB && ss[2] == 0xBF)
{
reVal = Encoding.UTF8;
}
else
{
int.TryParse(fs.Length.ToString(), out int i);
ss = r.ReadBytes(i);
if (GetBytesEncoding(ss)==Encoding.UTF8)
reVal = Encoding.UTF8;
}
}
r.Close();
return reVal;
}
private static Encoding GetBytesEncoding(byte[] bs)
{
int len = bs.Length;
if (len >= 3 && bs[0] == 0xEF && bs[1] == 0xBB && bs[2] == 0xBF)
{
return Encoding.UTF8;
}
int[] cs = { 7, 5, 4, 3, 2, 1, 0, 6, 14, 30, 62, 126 };
for (int i = 0; i < len; i++)
{
int bits = -1;
for (int j = 0; j < 6; j++)
{
if (bs[i] >> cs[j] == cs[j + 6])
{
bits = j;
break;
}
}
if (bits == -1)
{
return Encoding.Default;
}
while (bits-- > 0)
{
i++;
if (i == len || bs[i] >> 6 != 2)
{
return Encoding.Default;
}
}
}
return Encoding.UTF8;
}
/// <summary>
/// 判断是否是不带 BOM 的 UTF8 格式
/// </summary>
/// <param name="data"></param>
/// <returns></returns>
public static bool IsUTF8Bytes(byte[] data)
{
int charByteCounter = 1;  //计算当前正分析的字符应还有的字节数
byte curByte; //当前分析的字节.
for (int i = 0; i < data.Length; i++)
{
curByte = data[i];
if (charByteCounter == 1)
{
if (curByte >= 0x80)
{
//判断当前
while (((curByte <<= 1) & 0x80) != 0)
{
charByteCounter++;
}
//标记位首位若为非0 则至少以2个1开始 如:110XXXXX...........1111110X 
if (charByteCounter == 1 || charByteCounter > 6)
{
return false;
}
}
}
else
{
//若是UTF-8 此时第一位必须为1
if ((curByte & 0xC0) != 0x80)
{
return false;
}
charByteCounter--;
}
}
if (charByteCounter > 1)
{
throw new Exception("非预期的byte格式!");
}
return true;
}
}
}