做文件转换时,通常需要准确地知道文件类型,而仅凭后缀名判断并不可靠。网上也有通过读取文件头信息来判断文件类型的方法,但这些方法大多年代久远:当时的文件类型还没有那么丰富,文件头的定义也相对简单,因此对早期的文件类型判断比较准确,对后来新出现的文件类型判断就不那么准确了。
这里提供一个基于 Apache Tika 的文件类型判断方法,原理同样是读取文件头信息,但判断结果相对更准确。
pom.xml
<!-- Apache Tika core: provides the Tika facade the utility class uses for content-type detection -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.0</version>
</dependency>
<!-- Tika standard parsers: per the utility class notes, required to identify Office documents precisely -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>2.9.0</version>
<exclusions>
<exclusion>
<!-- This artifact conflicts with the java.xml package shipped with the JRE, so it is excluded -->
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Apache POI. NOTE(review): not referenced directly by the utility class shown here;
     presumably needed by the Office parsers or other conversion code - confirm -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.2</version>
</dependency>
<!-- poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.2</version>
</dependency>
实现类
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.ofd.OFDParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
import com.xxx.xxx.xxx.common.utils2.dto.FileContentTypeDTO;
/**
* 对文件类型进行识别,根据返回对象中的accuracy值可以判断识别的精准度,100 的含义是100%精准
* @author xxxxxx
*
*/
@Slf4j
public class _FileContentTypeUtil {
/**
* 仅依赖 tika-core包,可以快速精准检测出图片的content-type,不依赖文件后缀名
*/
private static Map<String, String> contentType_ImageMap = new HashMap();
static {
//
contentType_ImageMap.put("image/bmp", "bmp");
contentType_ImageMap.put("image/gif", "gif");
contentType_ImageMap.put("image/jpeg", "jpeg");
contentType_ImageMap.put("image/png", "png");
contentType_ImageMap.put("image/webp", "webp");
contentType_ImageMap.put("image/heic", "heic");
contentType_ImageMap.put("image/heif", "heif");
}
private static Map<String, String> contentType_LayoutDocumentMap = new HashMap();
static {
contentType_LayoutDocumentMap.put("application/pdf", "pdf");
}
/**
* 除了依赖 tika-core包,还依赖 tika-parsers-standard-package 包,才可以快速精准检测出新微软Office文档的content-type,不依赖文件后缀名
*/
private static Map<String, String> contentType_OoxmlDocumentMap = new HashMap();
static {
contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx");
contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx");
contentType_OoxmlDocumentMap.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", "pptx");
}
private static Map<String, String> contentType_OfficeDocumentMap = new HashMap();
static {
contentType_OfficeDocumentMap.put("application/msword", "doc");
contentType_OfficeDocumentMap.put("application/vnd.ms-excel", "xls");
}
private static Map<String, String> contentType_ZipMap = new HashMap();
static {
contentType_ZipMap.put("application/ofd", "ofd");
}
//
static String content_type_application_x_tika_ooxml = "application/x-tika-ooxml";
static String content_type_application_x_tika_msoffice = "application/x-tika-msoffice";
static String content_type_application_zip = "application/zip";
public static FileContentTypeDTO getContentType(File file) {
Tika tika = new Tika();
try {
String mimeType = tika.detect(file);
if (contentType_ImageMap.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(contentType_ImageMap.get(mimeType))
.accuracy(100)
.build();
}
if (contentType_LayoutDocumentMap.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(contentType_LayoutDocumentMap.get(mimeType))
.accuracy(100)
.build();
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// 其他情况,复用 getContentType(InputStream is) 方法
FileInputStream fis = null;
try {
fis = new FileInputStream(file);
return getContentType(fis);
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
try {
if(fis != null) {
fis.close();
}
} catch (Exception e2) {
// TODO: handle exception
}
}
return null;
}
public static FileContentTypeDTO getContentType(InputStream is) {
InputStream is1 = null, is2 = null;
try {
byte[] buffer = is.readAllBytes();
is1 = new ByteArrayInputStream(buffer);
is2 = new ByteArrayInputStream(buffer);
Tika tika = new Tika();
Metadata metadata = new Metadata();
String mimeType = tika.detect(is1, metadata);
if (contentType_ImageMap.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(contentType_ImageMap.get(mimeType))
.accuracy(100)
.build();
}
if (contentType_LayoutDocumentMap.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(contentType_LayoutDocumentMap.get(mimeType))
.accuracy(100)
.build();
}
// 如果环境中有OOXML的解析器
FileContentTypeDTO dto = parseOoxml(mimeType);
if(Objects.nonNull(dto)) {
return dto;
}
//
if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_x_tika_msoffice)) {
FileContentTypeDTO dto2 = parseMetadata(is2, new OfficeParser(), contentType_OfficeDocumentMap);
if(Objects.nonNull(dto2)) {
return dto2;
}
}
//
if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_zip)) {
FileContentTypeDTO dto2 = parseMetadata(is2, new OFDParser(), contentType_ZipMap);
if(Objects.nonNull(dto2)) {
return dto2;
}
}
System.out.println("-------------------------------得到的ContentType可能不准仅供参考,得到的直接ContentType是 = " + mimeType);
return FileContentTypeDTO.builder()
.contentType(mimeType)
.accuracy(50)
.build();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if(is!=null) {
is.close();
}
} catch (IOException e) {
e.printStackTrace();
}
try {
if(is1!=null) {
is1.close();
}
} catch (IOException e) {
e.printStackTrace();
}
try {
if(is2!=null) {
is2.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
private static FileContentTypeDTO parseOoxml(String mimeType) {
if(StringUtils.equalsIgnoreCase(mimeType, content_type_application_x_tika_ooxml)) {
if (contentType_OoxmlDocumentMap.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(contentType_OoxmlDocumentMap.get(mimeType))
.accuracy(100)
.build();
}
}
return null;
}
private static FileContentTypeDTO parseMetadata(InputStream inputstream, Parser parser, Map<String ,String> map) {
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata(); // empty metadata object
ParseContext context = new ParseContext();
try {
parser.parse(inputstream, handler, metadata, context);
// System.out.println("-=====================");
// System.out.println(handler.toString());
// //getting the list of all meta data elements
// String[] metadataNames = metadata.names();
// for(String name : metadataNames) {
// System.out.println(name + " : " + metadata.get(name));
// }
//
} catch (IOException e) {
e.printStackTrace();
} catch (TikaException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
}catch(Throwable t){
log.info("===============888888888=============");
t.printStackTrace();
} finally{
try {
if(inputstream != null) {
inputstream.close();
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println("---------- metadata mimeType is " + mimeType);
if (map.containsKey(mimeType)) {
return FileContentTypeDTO.builder()
.contentType(mimeType)
.fileExtName(map.get(mimeType))
.accuracy(100)
.build();
}
return null;
}
}
FileContentTypeDTO
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class FileContentTypeDTO {

    /** Content type of the file. */
    @JsonProperty("content_type")
    private String contentType;

    /** File extension matching the content type. */
    @JsonProperty("file_ext_name")
    private String fileExtName;

    /**
     * Accuracy of the content type: 100 means 100% accurate, any other value means the
     * content type is not guaranteed to be correct.
     */
    private Integer accuracy;

    /**
     * Whether the result is 100% accurate. The getter below derives its answer from
     * {@code accuracy} and does not read this stored value.
     */
    @JsonProperty("accuracy100_percent")
    private boolean accuracy100Percent;

    /**
     * Tells whether the content type was identified with full accuracy.
     *
     * @return {@code true} only when {@code accuracy} is set and equals 100
     */
    public boolean getAccuracy100Percent() {
        if (accuracy == null) {
            return false;
        }
        return accuracy.intValue() == 100;
    }
}