文件类型判断工具类

yuyu888 于 2024-03-12 发布

做文件转换时，通常需要准确地知道文件类型，而仅凭后缀名判断并不可靠。网上也有通过读取文件头信息来判断文件类型的方法，但这些方法大多年代久远：当时的文件类型还没那么丰富，文件头定义也相对简单，因此对早期的文件类型判断较为准确，对后来新出现的文件类型就不那么准确了。

这里提供一个文件类型判断方法，原理同样是读取文件头信息（基于 Apache Tika 实现），但判断结果相对更准确。

pom.xml

		<!-- Tika core: the Tika facade and the built-in MIME-type detection used by the utility class -->
		<dependency> 
		    <groupId>org.apache.tika</groupId> 
		    <artifactId>tika-core</artifactId> 
		    <version>2.9.0</version> 
		</dependency>
		<!-- Tika standard parsers: provides the OfficeParser / OFDParser classes imported by the utility class -->
		<dependency>
			<groupId>org.apache.tika</groupId>
			<artifactId>tika-parsers-standard-package</artifactId>
			<version>2.9.0</version>
			<exclusions>
				<exclusion>
					<!-- This artifact conflicts with the java.xml package shipped with the JRE, so it is excluded -->
					<groupId>xml-apis</groupId>
					<artifactId>xml-apis</artifactId>
				</exclusion>
			</exclusions>
		</dependency>

		<!-- Apache POI -->
		<!-- NOTE(review): confirm 5.2.2 matches the POI version that tika-parsers-standard-package 2.9.0 pulls in transitively -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>5.2.2</version>
		</dependency>
		<!-- poi-ooxml -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>5.2.2</version>
		</dependency>

实现类

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.ofd.OFDParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import com.xxx.xxx.xxx.common.utils2.dto.FileContentTypeDTO;

/**
 * 对文件类型进行识别,根据返回对象中的accuracy值可以判断识别的精准度,100 的含义是100%精准
 * @author xxxxxx
 *
 */
@Slf4j
public class _FileContentTypeUtil {
	/**
	 * 仅依赖 tika-core包,可以快速精准检测出图片的content-type,不依赖文件后缀名
	 */
	private static Map<String, String> contentType_ImageMap = new HashMap();
	static {
		// 
		contentType_ImageMap.put("image/bmp", "bmp");
		contentType_ImageMap.put("image/gif", "gif");
		contentType_ImageMap.put("image/jpeg", "jpeg");
		contentType_ImageMap.put("image/png", "png");
		contentType_ImageMap.put("image/webp", "webp");
		contentType_ImageMap.put("image/heic", "heic");
		contentType_ImageMap.put("image/heif", "heif");
	}
	private static Map<String, String> contentType_LayoutDocumentMap = new HashMap();
	static {
		contentType_LayoutDocumentMap.put("application/pdf", "pdf");
	}
	/**
	 * 除了依赖 tika-core包,还依赖 tika-parsers-standard-package 包,才可以快速精准检测出新微软Office文档的content-type,不依赖文件后缀名
	 */
	private static Map<String, String> contentType_OoxmlDocumentMap = new HashMap();
	static {
		contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx");
		contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx");
		contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx");
	}
	private static Map<String, String> contentType_OfficeDocumentMap = new HashMap();
	static {
		contentType_OfficeDocumentMap.put("application/msword", "doc");
		contentType_OfficeDocumentMap.put("application/vnd.ms-excel", "xls");
	}

	private static Map<String, String> contentType_ZipMap = new HashMap();
	static {
		contentType_ZipMap.put("application/ofd", "ofd");
	}
	
	//
	static String content_type_application_x_tika_ooxml = "application/x-tika-ooxml";
	static String content_type_application_x_tika_msoffice = "application/x-tika-msoffice";
	static String content_type_application_zip = "application/zip";

	public static FileContentTypeDTO getContentType(File file) {
		
		
		Tika tika = new Tika();
		try {
			String mimeType = tika.detect(file);
			if (contentType_ImageMap.containsKey(mimeType)) {
				return FileContentTypeDTO.builder()
						.contentType(mimeType)
						.fileExtName(contentType_ImageMap.get(mimeType))
						.accuracy(100)
						.build();
			} 
			if (contentType_LayoutDocumentMap.containsKey(mimeType)) {
				return FileContentTypeDTO.builder()
						.contentType(mimeType)
						.fileExtName(contentType_LayoutDocumentMap.get(mimeType))
						.accuracy(100)
						.build();
			} 
			
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		// 其他情况,复用 getContentType(InputStream is) 方法
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(file);
			return getContentType(fis);
		} catch (FileNotFoundException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} finally {
			try {
				if(fis != null) {
					fis.close();
				}
			} catch (Exception e2) {
				// TODO: handle exception
			}
		}
		return null;
	}

	public static FileContentTypeDTO getContentType(InputStream is) {
		InputStream is1 = null, is2 = null;
		try {
			byte[] buffer = is.readAllBytes();

			is1 = new ByteArrayInputStream(buffer);
			is2 = new ByteArrayInputStream(buffer);

			Tika tika = new Tika();
			Metadata metadata = new Metadata();
			String mimeType = tika.detect(is1, metadata);

			if (contentType_ImageMap.containsKey(mimeType)) {
				return FileContentTypeDTO.builder()
						.contentType(mimeType)
						.fileExtName(contentType_ImageMap.get(mimeType))
						.accuracy(100)
						.build();
			}
			if (contentType_LayoutDocumentMap.containsKey(mimeType)) {
				return FileContentTypeDTO.builder()
						.contentType(mimeType)
						.fileExtName(contentType_LayoutDocumentMap.get(mimeType))
						.accuracy(100)
						.build();
			} 
			// 如果环境中有OOXML的解析器
			FileContentTypeDTO dto = parseOoxml(mimeType);
			if(Objects.nonNull(dto)) {
				return dto;
			}
			// 
			if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_x_tika_msoffice)) {
				FileContentTypeDTO dto2 = parseMetadata(is2, new OfficeParser(), contentType_OfficeDocumentMap);
				if(Objects.nonNull(dto2)) {
					return dto2;
				} 
			}
			//
			if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_zip)) {
				FileContentTypeDTO dto2 = parseMetadata(is2, new OFDParser(), contentType_ZipMap);
				if(Objects.nonNull(dto2)) {
					return dto2;
				} 
			}
			System.out.println("-------------------------------得到的ContentType可能不准仅供参考,得到的直接ContentType是 = " + mimeType);
			
			return FileContentTypeDTO.builder()
					.contentType(mimeType)
					.accuracy(50)
					.build();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			try {
				if(is!=null) {
					is.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
			try {
				if(is1!=null) {
					is1.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
			try {
				if(is2!=null) {
					is2.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return null;
	}

	private static FileContentTypeDTO parseOoxml(String mimeType) {
		if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_x_tika_ooxml)) {
			if (contentType_OoxmlDocumentMap.containsKey(mimeType)) {
				return FileContentTypeDTO.builder()
						.contentType(mimeType)
						.fileExtName(contentType_OoxmlDocumentMap.get(mimeType))
						.accuracy(100)
						.build();
			}
		}
		return null;
	}

	private static FileContentTypeDTO parseMetadata(InputStream inputstream, Parser parser, Map<String ,String> map) {
		BodyContentHandler handler = new BodyContentHandler();
		Metadata metadata = new Metadata(); // empty metadata object
		ParseContext context = new ParseContext();
		try {
			parser.parse(inputstream, handler, metadata, context);
//			System.out.println("-=====================");
//			System.out.println(handler.toString());
//		      //getting the list of all meta data elements 
//		      String[] metadataNames = metadata.names();
//		      for(String name : metadataNames) {		        
//		         System.out.println(name + " : " + metadata.get(name));
//		      }
//
		} catch (IOException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		}catch(Throwable t){
			log.info("===============888888888=============");
			t.printStackTrace();
		} finally{
			try {
				if(inputstream != null) {
					inputstream.close();
				}
			} catch (IOException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
		}
		
		String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
		System.out.println("---------- metadata mimeType is " + mimeType);
		if (map.containsKey(mimeType)) {
			return FileContentTypeDTO.builder()
					.contentType(mimeType)
					.fileExtName(map.get(mimeType))
					.accuracy(100)
					.build();
		}
		return null;
	}
}

FileContentTypeDTO

@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class FileContentTypeDTO {

	/** Detected content type (MIME type) of the file. */
	@JsonProperty("content_type")
	private String contentType;

	/** File extension that corresponds to the detected content type. */
	@JsonProperty("file_ext_name")
	private String fileExtName;

	/**
	 * Confidence in {@link #contentType}: 100 means the type is certain, any other value
	 * means it is not guaranteed to be correct.
	 */
	private Integer accuracy;

	/**
	 * Whether the detection is 100% accurate.
	 *
	 * <p>NOTE(review): the getter below derives its answer from {@link #accuracy} and never
	 * reads this field.
	 */
	@JsonProperty("accuracy100_percent")
	private boolean accuracy100Percent;

	/**
	 * Reports whether {@link #accuracy} is exactly 100.
	 *
	 * @return {@code true} only when {@code accuracy} is non-null and equal to 100
	 */
	public boolean getAccuracy100Percent() {
		// equals() on a boxed 100 is false for null, so no separate null check is needed.
		return Integer.valueOf(100).equals(accuracy);
	}
}