Usando .NET, ¿cómo se puede encontrar el tipo mime de un archivo según la firma del archivo, no la extensión?
Estoy buscando una manera sencilla de obtener un tipo MIME donde la extensión del archivo es incorrecta o no se proporciona, algo similar a esta pregunta solo en .Net.
Al final usé urlmon.dll. Pensé que habría una manera más fácil pero esto funciona. Incluyo el código para ayudar a cualquier otra persona y permitirme encontrarlo nuevamente si lo necesito.
using System.Runtime.InteropServices;
...
[DllImport(@"urlmon.dll", CharSet = CharSet.Auto)]
private extern static System.UInt32 FindMimeFromData(
System.UInt32 pBC,
[MarshalAs(UnmanagedType.LPStr)] System.String pwzUrl,
[MarshalAs(UnmanagedType.LPArray)] byte[] pBuffer,
System.UInt32 cbSize,
[MarshalAs(UnmanagedType.LPStr)] System.String pwzMimeProposed,
System.UInt32 dwMimeFlags,
out System.UInt32 ppwzMimeOut,
System.UInt32 dwReserverd
);
public static string getMimeFromFile(string filename)
{
if (!File.Exists(filename))
throw new FileNotFoundException(filename + " not found");
byte[] buffer = new byte[256];
using (FileStream fs = new FileStream(filename, FileMode.Open))
{
if (fs.Length >= 256)
fs.Read(buffer, 0, 256);
else
fs.Read(buffer, 0, (int)fs.Length);
}
try
{
System.UInt32 mimetype;
FindMimeFromData(0, null, buffer, 256, null, 0, out mimetype, 0);
System.IntPtr mimeTypePtr = new IntPtr(mimetype);
string mime = Marshal.PtrToStringUni(mimeTypePtr);
Marshal.FreeCoTaskMem(mimeTypePtr);
return mime;
}
catch (Exception e)
{
return "unknown/unknown";
}
}
Editar: solo usa Mime Detective
Utilizo secuencias de matrices de bytes para determinar el tipo MIME correcto de un archivo determinado. La ventaja de esto sobre simplemente mirar la extensión del nombre del archivo es que si un usuario cambiara el nombre de un archivo para evitar ciertas restricciones de carga de tipos de archivos, la extensión del nombre del archivo no detectaría esto. Por otro lado, obtener la firma del archivo mediante una matriz de bytes evitará que se produzca este comportamiento travieso.
Aquí hay un ejemplo en C#:
public class MimeType
{
private static readonly byte[] BMP = { 66, 77 };
private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 };
private static readonly byte[] EXE_DLL = { 77, 90 };
private static readonly byte[] GIF = { 71, 73, 70, 56 };
private static readonly byte[] ICO = { 0, 0, 1, 0 };
private static readonly byte[] JPG = { 255, 216, 255 };
private static readonly byte[] MP3 = { 255, 251, 48 };
private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 };
private static readonly byte[] PDF = { 37, 80, 68, 70, 45, 49, 46 };
private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 };
private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7, 0 };
private static readonly byte[] SWF = { 70, 87, 83 };
private static readonly byte[] TIFF = { 73, 73, 42, 0 };
private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 };
private static readonly byte[] TTF = { 0, 1, 0, 0, 0 };
private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 };
private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 };
private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 };
public static string GetMimeType(byte[] file, string fileName)
{
string mime = "application/octet-stream"; //DEFAULT UNKNOWN MIME TYPE
//Ensure that the filename isn't empty or null
if (string.IsNullOrWhiteSpace(fileName))
{
return mime;
}
//Get the file extension
string extension = Path.GetExtension(fileName) == null
? string.Empty
: Path.GetExtension(fileName).ToUpper();
//Get the MIME Type
if (file.Take(2).SequenceEqual(BMP))
{
mime = "image/bmp";
}
else if (file.Take(8).SequenceEqual(DOC))
{
mime = "application/msword";
}
else if (file.Take(2).SequenceEqual(EXE_DLL))
{
mime = "application/x-msdownload"; //both use same mime type
}
else if (file.Take(4).SequenceEqual(GIF))
{
mime = "image/gif";
}
else if (file.Take(4).SequenceEqual(ICO))
{
mime = "image/x-icon";
}
else if (file.Take(3).SequenceEqual(JPG))
{
mime = "image/jpeg";
}
else if (file.Take(3).SequenceEqual(MP3))
{
mime = "audio/mpeg";
}
else if (file.Take(14).SequenceEqual(OGG))
{
if (extension == ".OGX")
{
mime = "application/ogg";
}
else if (extension == ".OGA")
{
mime = "audio/ogg";
}
else
{
mime = "video/ogg";
}
}
else if (file.Take(7).SequenceEqual(PDF))
{
mime = "application/pdf";
}
else if (file.Take(16).SequenceEqual(PNG))
{
mime = "image/png";
}
else if (file.Take(7).SequenceEqual(RAR))
{
mime = "application/x-rar-compressed";
}
else if (file.Take(3).SequenceEqual(SWF))
{
mime = "application/x-shockwave-flash";
}
else if (file.Take(4).SequenceEqual(TIFF))
{
mime = "image/tiff";
}
else if (file.Take(11).SequenceEqual(TORRENT))
{
mime = "application/x-bittorrent";
}
else if (file.Take(5).SequenceEqual(TTF))
{
mime = "application/x-font-ttf";
}
else if (file.Take(4).SequenceEqual(WAV_AVI))
{
mime = extension == ".AVI" ? "video/x-msvideo" : "audio/x-wav";
}
else if (file.Take(16).SequenceEqual(WMV_WMA))
{
mime = extension == ".WMA" ? "audio/x-ms-wma" : "video/x-ms-wmv";
}
else if (file.Take(4).SequenceEqual(ZIP_DOCX))
{
mime = extension == ".DOCX" ? "application/vnd.openxmlformats-officedocument.wordprocessingml.document" : "application/x-zip-compressed";
}
return mime;
}
}
Observe que manejé los tipos de archivos DOCX de manera diferente, ya que DOCX en realidad es solo un archivo ZIP. En este escenario, simplemente verifico la extensión del archivo una vez que verifiqué que tiene esa secuencia. Este ejemplo está lejos de ser completo para algunas personas, pero puedes agregar fácilmente el tuyo propio.
Si desea agregar más tipos MIME, puede obtener las secuencias de matriz de bytes de muchos tipos de archivos diferentes desde aquí . Además, aquí hay otro buen recurso sobre firmas de archivos.
Lo que hago muchas veces si todo lo demás falla es recorrer varios archivos de un tipo particular que estoy buscando y buscar un patrón en la secuencia de bytes de los archivos. Al final, esto sigue siendo una verificación básica y no se puede utilizar como prueba 100% para determinar los tipos de archivos.
Encontré una solución codificada, espero poder ayudar a alguien:
public static class MIMEAssistant
{
private static readonly Dictionary<string, string> MIMETypesDictionary = new Dictionary<string, string>
{
{"ai", "application/postscript"},
{"aif", "audio/x-aiff"},
{"aifc", "audio/x-aiff"},
{"aiff", "audio/x-aiff"},
{"asc", "text/plain"},
{"atom", "application/atom+xml"},
{"au", "audio/basic"},
{"avi", "video/x-msvideo"},
{"bcpio", "application/x-bcpio"},
{"bin", "application/octet-stream"},
{"bmp", "image/bmp"},
{"cdf", "application/x-netcdf"},
{"cgm", "image/cgm"},
{"class", "application/octet-stream"},
{"cpio", "application/x-cpio"},
{"cpt", "application/mac-compactpro"},
{"csh", "application/x-csh"},
{"css", "text/css"},
{"dcr", "application/x-director"},
{"dif", "video/x-dv"},
{"dir", "application/x-director"},
{"djv", "image/vnd.djvu"},
{"djvu", "image/vnd.djvu"},
{"dll", "application/octet-stream"},
{"dmg", "application/octet-stream"},
{"dms", "application/octet-stream"},
{"doc", "application/msword"},
{"docx","application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
{"dotx", "application/vnd.openxmlformats-officedocument.wordprocessingml.template"},
{"docm","application/vnd.ms-word.document.macroEnabled.12"},
{"dotm","application/vnd.ms-word.template.macroEnabled.12"},
{"dtd", "application/xml-dtd"},
{"dv", "video/x-dv"},
{"dvi", "application/x-dvi"},
{"dxr", "application/x-director"},
{"eps", "application/postscript"},
{"etx", "text/x-setext"},
{"exe", "application/octet-stream"},
{"ez", "application/andrew-inset"},
{"gif", "image/gif"},
{"gram", "application/srgs"},
{"grxml", "application/srgs+xml"},
{"gtar", "application/x-gtar"},
{"hdf", "application/x-hdf"},
{"hqx", "application/mac-binhex40"},
{"htm", "text/html"},
{"html", "text/html"},
{"ice", "x-conference/x-cooltalk"},
{"ico", "image/x-icon"},
{"ics", "text/calendar"},
{"ief", "image/ief"},
{"ifb", "text/calendar"},
{"iges", "model/iges"},
{"igs", "model/iges"},
{"jnlp", "application/x-java-jnlp-file"},
{"jp2", "image/jp2"},
{"jpe", "image/jpeg"},
{"jpeg", "image/jpeg"},
{"jpg", "image/jpeg"},
{"js", "application/x-javascript"},
{"kar", "audio/midi"},
{"latex", "application/x-latex"},
{"lha", "application/octet-stream"},
{"lzh", "application/octet-stream"},
{"m3u", "audio/x-mpegurl"},
{"m4a", "audio/mp4a-latm"},
{"m4b", "audio/mp4a-latm"},
{"m4p", "audio/mp4a-latm"},
{"m4u", "video/vnd.mpegurl"},
{"m4v", "video/x-m4v"},
{"mac", "image/x-macpaint"},
{"man", "application/x-troff-man"},
{"mathml", "application/mathml+xml"},
{"me", "application/x-troff-me"},
{"mesh", "model/mesh"},
{"mid", "audio/midi"},
{"midi", "audio/midi"},
{"mif", "application/vnd.mif"},
{"mov", "video/quicktime"},
{"movie", "video/x-sgi-movie"},
{"mp2", "audio/mpeg"},
{"mp3", "audio/mpeg"},
{"mp4", "video/mp4"},
{"mpe", "video/mpeg"},
{"mpeg", "video/mpeg"},
{"mpg", "video/mpeg"},
{"mpga", "audio/mpeg"},
{"ms", "application/x-troff-ms"},
{"msh", "model/mesh"},
{"mxu", "video/vnd.mpegurl"},
{"nc", "application/x-netcdf"},
{"oda", "application/oda"},
{"ogg", "application/ogg"},
{"pbm", "image/x-portable-bitmap"},
{"pct", "image/pict"},
{"pdb", "chemical/x-pdb"},
{"pdf", "application/pdf"},
{"pgm", "image/x-portable-graymap"},
{"pgn", "application/x-chess-pgn"},
{"pic", "image/pict"},
{"pict", "image/pict"},
{"png", "image/png"},
{"pnm", "image/x-portable-anymap"},
{"pnt", "image/x-macpaint"},
{"pntg", "image/x-macpaint"},
{"ppm", "image/x-portable-pixmap"},
{"ppt", "application/vnd.ms-powerpoint"},
{"pptx","application/vnd.openxmlformats-officedocument.presentationml.presentation"},
{"potx","application/vnd.openxmlformats-officedocument.presentationml.template"},
{"ppsx","application/vnd.openxmlformats-officedocument.presentationml.slideshow"},
{"ppam","application/vnd.ms-powerpoint.addin.macroEnabled.12"},
{"pptm","application/vnd.ms-powerpoint.presentation.macroEnabled.12"},
{"potm","application/vnd.ms-powerpoint.template.macroEnabled.12"},
{"ppsm","application/vnd.ms-powerpoint.slideshow.macroEnabled.12"},
{"ps", "application/postscript"},
{"qt", "video/quicktime"},
{"qti", "image/x-quicktime"},
{"qtif", "image/x-quicktime"},
{"ra", "audio/x-pn-realaudio"},
{"ram", "audio/x-pn-realaudio"},
{"ras", "image/x-cmu-raster"},
{"rdf", "application/rdf+xml"},
{"rgb", "image/x-rgb"},
{"rm", "application/vnd.rn-realmedia"},
{"roff", "application/x-troff"},
{"rtf", "text/rtf"},
{"rtx", "text/richtext"},
{"sgm", "text/sgml"},
{"sgml", "text/sgml"},
{"sh", "application/x-sh"},
{"shar", "application/x-shar"},
{"silo", "model/mesh"},
{"sit", "application/x-stuffit"},
{"skd", "application/x-koan"},
{"skm", "application/x-koan"},
{"skp", "application/x-koan"},
{"skt", "application/x-koan"},
{"smi", "application/smil"},
{"smil", "application/smil"},
{"snd", "audio/basic"},
{"so", "application/octet-stream"},
{"spl", "application/x-futuresplash"},
{"src", "application/x-wais-source"},
{"sv4cpio", "application/x-sv4cpio"},
{"sv4crc", "application/x-sv4crc"},
{"svg", "image/svg+xml"},
{"swf", "application/x-shockwave-flash"},
{"t", "application/x-troff"},
{"tar", "application/x-tar"},
{"tcl", "application/x-tcl"},
{"tex", "application/x-tex"},
{"texi", "application/x-texinfo"},
{"texinfo", "application/x-texinfo"},
{"tif", "image/tiff"},
{"tiff", "image/tiff"},
{"tr", "application/x-troff"},
{"tsv", "text/tab-separated-values"},
{"txt", "text/plain"},
{"ustar", "application/x-ustar"},
{"vcd", "application/x-cdlink"},
{"vrml", "model/vrml"},
{"vxml", "application/voicexml+xml"},
{"wav", "audio/x-wav"},
{"wbmp", "image/vnd.wap.wbmp"},
{"wbmxl", "application/vnd.wap.wbxml"},
{"wml", "text/vnd.wap.wml"},
{"wmlc", "application/vnd.wap.wmlc"},
{"wmls", "text/vnd.wap.wmlscript"},
{"wmlsc", "application/vnd.wap.wmlscriptc"},
{"wrl", "model/vrml"},
{"xbm", "image/x-xbitmap"},
{"xht", "application/xhtml+xml"},
{"xhtml", "application/xhtml+xml"},
{"xls", "application/vnd.ms-excel"},
{"xml", "application/xml"},
{"xpm", "image/x-xpixmap"},
{"xsl", "application/xml"},
{"xlsx","application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"},
{"xltx","application/vnd.openxmlformats-officedocument.spreadsheetml.template"},
{"xlsm","application/vnd.ms-excel.sheet.macroEnabled.12"},
{"xltm","application/vnd.ms-excel.template.macroEnabled.12"},
{"xlam","application/vnd.ms-excel.addin.macroEnabled.12"},
{"xlsb","application/vnd.ms-excel.sheet.binary.macroEnabled.12"},
{"xslt", "application/xslt+xml"},
{"xul", "application/vnd.mozilla.xul+xml"},
{"xwd", "image/x-xwindowdump"},
{"xyz", "chemical/x-xyz"},
{"zip", "application/zip"}
};
public static string GetMIMEType(string fileName)
{
//get file extension
string extension = Path.GetExtension(fileName).ToLowerInvariant();
if (extension.Length > 0 &&
MIMETypesDictionary.ContainsKey(extension.Remove(0, 1)))
{
return MIMETypesDictionary[extension.Remove(0, 1)];
}
return "unknown/unknown";
}
}
En Urlmon.dll, hay una función llamada FindMimeFromData
.
De la documentación
La detección de tipo MIME, o "rastreo de datos", se refiere al proceso de determinar un tipo MIME apropiado a partir de datos binarios. El resultado final depende de una combinación de encabezados de tipo MIME proporcionados por el servidor, extensión de archivo y/o los datos mismos. Normalmente, sólo los primeros 256 bytes de datos son significativos.
Entonces, lea los primeros (hasta) 256 bytes del archivo y páselo a FindMimeFromData
.