魏如蓝 3 years ago
parent
commit
71e4e2a41c
3 changed files with 355 additions and 0 deletions
  1. +95
    -0
      codes/app.py
  2. +169
    -0
      codes/feature.py
  3. +91
    -0
      codes/pdf_to_pics.py

+ 95
- 0
codes/app.py View File

@ -0,0 +1,95 @@
# from flask import Flask
import pdf_to_pics,feature
import os
import uuid
import platform
from flask import Flask,request,redirect,url_for,Blueprint
from werkzeug.utils import secure_filename
from sim import compute_similarity
# Path separator used when file paths are assembled by hand:
# a backslash on Windows, a forward slash on every other platform.
slash = '\\' if platform.system() == "Windows" else '/'
# Folder (relative to the working directory) where uploads are stored.
UPLOAD_FOLDER = 'upload'
# Extensions accepted by the upload form: lower-case, without the dot.
ALLOW_EXTENSIONS = {'html', 'htm', 'doc', 'docx', 'mht', 'pdf'}
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Create the upload folder when it is missing. exist_ok makes this a single
# call with no gap between "check" and "create".
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOW_EXTENSIONS.

    The extension is everything after the last dot. It is lower-cased before
    the lookup so that names such as ``report.PDF`` are accepted too
    (ALLOW_EXTENSIONS holds lower-case entries only). A name without a dot
    is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOW_EXTENSIONS
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Show the upload form (GET) or store an uploaded document (POST).

    On a POST whose ``file`` field carries a name accepted by allowed_file(),
    the upload is saved in the configured upload folder under a random UUID
    name and the client is redirected to the ``file2pic`` view. In every
    other case the HTML upload form is returned.
    """
    if request.method == 'POST':
        # The form submits the document in the multipart field named "file".
        file = request.files['file']
        if file and allowed_file(file.filename):
            # Take the extension from the original name: allowed_file() has
            # just checked it against the allow-list. secure_filename() is
            # not usable here because it drops non-ASCII characters, which
            # can remove everything before the dot and the dot itself.
            extension = file.filename.rsplit('.', 1)[1].lower()
            # Uploaded names may collide, so store the file under a UUID.
            file_name = str(uuid.uuid4()) + '.' + extension
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], file_name))
            file_path = os.path.join(
                os.getcwd(), app.config['UPLOAD_FOLDER'], file_name)
            print(file_path)
            return redirect(url_for('file2pic', file=file_name))
    return '''
<!doctype html>
<title>Upload new File</title>
<h1>Select PDF to Upload</h1>
<form action="" method=post enctype=multipart/form-data>
<p><input type=file name=file>
<input type=submit value=Upload>
</form>
'''
@app.route('/tt/<file>')
def file2pic(file):
    """Hand the stored upload *file* to pdf_to_pics.to_pics.call_pdf2pic.

    Whatever call_pdf2pic returns is passed straight back to the client.
    """
    response = pdf_to_pics.to_pics().call_pdf2pic(file)
    print('file2pic')
    return response
@app.route('/ttt/<path>')
def call_sim(path):
    """Run the similarity computation from sim/compute_similarity.py.

    *path* is forwarded unchanged to ``simi().similarity`` and the result of
    that call is returned to the client.
    """
    response = compute_similarity.simi().similarity(path)
    print("call_sim")
    return response
# @app.route('/tttt/<alist>')
# def call_feature(alist):
# print("call_feature")
# the_feature = feature.feat()
# message = the_feature.call_feature_extraction(alist)
# # print("call_feature")
# return message
if __name__ == "__main__":
    # Attach the blueprint exported by pdf_to_pics, then start the server.
    blueprint = pdf_to_pics.bp_2pics
    app.register_blueprint(blueprint)
    # No host/port is given here; pass them to app.run(), e.g.
    # host='0.0.0.0', port=7000, to listen elsewhere.
    app.run()

+ 169
- 0
codes/feature.py View File

@ -0,0 +1,169 @@
from __future__ import print_function
import cv2
import numpy as np
import os
from flask import render_template
import zipfile
# Passed as the first argument to cv2.ORB_create() in feat.alignImages.
MAX_MATCHES = 500
# Fraction of the distance-sorted matches that feat.alignImages keeps
# (the best 15%); the rest are discarded.
GOOD_MATCH_PERCENT = 0.15
class feat:
    """ORB feature matching between image pairs, plus packaging of results."""

    def call_feature_extraction_1(self, folder_path, list, index, tmp):
        """Draw feature matches for the top-ranked image pairs and zip them.

        Args:
            folder_path: Working folder path. Its first 7 characters are
                dropped and the remainder is appended to ``"output/"`` to
                form the output directory.
            list: Rows describing image pairs. ``row[0]`` is the reference
                image path, ``row[1]`` the path of the image to align, and
                ``row[2]`` / ``row[5]`` are the sort keys (descending).
            index: ``int(index / 10)`` pairs are processed.
            tmp: Unused; kept so existing callers keep working.

        Returns:
            The download URL returned by zipDir() for the output directory.

        Raises:
            IndexError: If ``int(index / 10)`` exceeds the number of rows.
        """
        # Sort by similarity, best pair first.
        ranked = sorted(list, key=lambda row: (row[2], row[5]), reverse=True)
        print(ranked)
        print('---------------')
        print(index)
        # Depends only on folder_path, so compute it once up front. This also
        # keeps it defined when the loop below processes no pair at all.
        outFilename = "output/" + folder_path[7:]
        for i in range(int(index / 10)):
            refFilename = ranked[i][0]
            print("Reading reference image : ", refFilename)
            imReference = cv2.imread(refFilename, cv2.IMREAD_COLOR)
            imFilename = ranked[i][1]
            print("Reading image to align : ", imFilename)
            im = cv2.imread(imFilename, cv2.IMREAD_COLOR)
            print(refFilename)
            print(imFilename)
            print(folder_path[7:])
            # name1 keeps the leading '/' and loses the last 4 characters
            # (the extension); name2 is the bare file name of the second
            # image. Together they form "/<ref><img>.<ext>".
            name1 = refFilename[refFilename.rfind('/'):-4]
            print(name1)
            name2 = imFilename[imFilename.rfind('/') + 1:]
            print(name2)
            print("Saving aligned image : ", outFilename)
            print("Aligning images ...")
            # Only the match visualisation (third value) is used here.
            _, _, img5 = self.alignImages(im, imReference)
            if isinstance(img5, str):
                # alignImages returns the string 'white' when it cannot
                # match the pair; nothing is written for it.
                print('white')
                continue
            print(outFilename)
            # cv2.imwrite does not create missing directories.
            if not os.path.exists(outFilename):
                os.makedirs(outFilename)
            cv2.imwrite(outFilename + name1 + name2, img5)
        # NOTE(review): zipping once after all pairs are processed is the
        # assumed structure; confirm against the original indentation.
        return feat.zipDir(outFilename)

    def alignImages(self, im1, im2):
        """Match ORB features of two BGR images and align im1 onto im2.

        Side effects: writes the match visualisation to ``matches.jpg`` in
        the working directory and draws the keypoints into *im1* and *im2*.

        Returns:
            ``(im1Reg, h, img5)``: *im1* warped into the frame of *im2*, the
            homography, and an image showing the matches. When either image
            has no keypoints, fewer than four matches survive, or no
            homography can be estimated, ``('white', 'white', 'white')`` is
            returned instead.
        """
        failed = ('white', 'white', 'white')
        # Convert images to grayscale.
        im1Gray = cv2.cvtColor(im1, cv2.COLOR_BGR2GRAY)
        im2Gray = cv2.cvtColor(im2, cv2.COLOR_BGR2GRAY)
        # Detect ORB features and compute descriptors.
        orb = cv2.ORB_create(MAX_MATCHES)
        keypoints1, descriptors1 = orb.detectAndCompute(im1Gray, None)
        keypoints2, descriptors2 = orb.detectAndCompute(im2Gray, None)
        if not keypoints1 or not keypoints2:
            return failed
        # Match features.
        matcher = cv2.DescriptorMatcher_create(
            cv2.DESCRIPTOR_MATCHER_BRUTEFORCE_HAMMING)
        matches = matcher.match(descriptors1, descriptors2, None)
        # Best (smallest distance) first. sorted() is used because newer
        # OpenCV versions return a tuple, which has no .sort().
        matches = sorted(matches, key=lambda m: m.distance)
        # Keep only the best GOOD_MATCH_PERCENT of the matches.
        numGoodMatches = int(len(matches) * GOOD_MATCH_PERCENT)
        matches = matches[:numGoodMatches]
        # Draw the kept matches.
        imMatches = cv2.drawMatches(
            im1, keypoints1, im2, keypoints2, matches, None)
        cv2.imwrite("matches.jpg", imMatches)
        # A homography needs at least four point correspondences.
        if len(matches) < 4:
            return failed
        # Extract the locations of the kept matches.
        points1 = np.zeros((len(matches), 2), dtype=np.float32)
        points2 = np.zeros((len(matches), 2), dtype=np.float32)
        for i, match in enumerate(matches):
            points1[i, :] = keypoints1[match.queryIdx].pt
            points2[i, :] = keypoints2[match.trainIdx].pt
        # Find the homography.
        h, mask = cv2.findHomography(points1, points2, cv2.RANSAC)
        if h is None:
            return failed
        # Use the homography to warp im1 into the frame of im2.
        height, width, channels = im2.shape
        im1Reg = cv2.warpPerspective(im1, h, (width, height))
        img1 = cv2.drawKeypoints(im1, keypoints1, im1, color=(255, 0, 255))
        img2 = cv2.drawKeypoints(im2, keypoints2, im2, color=(255, 0, 255))
        img5 = cv2.drawMatches(
            img1, keypoints1, img2, keypoints2, matches, None, flags=2)
        return im1Reg, h, img5

    def return_img_stream(self, img_local_path):
        """Return the image at *img_local_path* as a base64 text stream.

        Args:
            img_local_path: Local path of a single image file.

        Returns:
            The file content, base64-encoded, as an ASCII string.
        """
        import base64
        # Images are binary data: read bytes, not text.
        with open(img_local_path, 'rb') as img_f:
            img_stream = base64.b64encode(img_f.read())
        return img_stream.decode('ascii')

    @staticmethod
    def zipDir(dirpath, out_root="/home/wwwroot/default/",
               url_root="http://106.75.226.23/"):
        """Zip every file below *dirpath* and return the archive's URL.

        The archive is named ``dirpath[21:] + ".zip"`` and written to
        *out_root*. Entries are stored relative to *dirpath*.

        Args:
            dirpath: Directory to compress; its first 21 characters are
                dropped to form the archive name.
            out_root: Directory prefix (with trailing separator) under which
                the archive is written.
            url_root: URL prefix (with trailing slash) used for the result.

        Returns:
            ``url_root + dirpath[21:] + ".zip"``.
        """
        archive_name = dirpath[21:] + ".zip"
        outFullName = out_root + archive_name
        # The context manager closes the archive even if a write fails.
        with zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED) as archive:
            for path, dirnames, filenames in os.walk(dirpath):
                for filename in filenames:
                    full_path = os.path.join(path, filename)
                    # Store entries without the target root directory.
                    archive.write(full_path,
                                  os.path.relpath(full_path, dirpath))
        return url_root + archive_name
# im1 = '/Users/wrl/Desktop/test0506/pic/2303.png'
# im2 = '/Users/wrl/Desktop/test0506/pic/2304.png'
# img1 = cv2.imread(im1, cv2.IMREAD_COLOR)
# img2 = cv2.imread(im2, cv2.IMREAD_COLOR)
# imReg, h, img5 =feat().alignImages(img1,img2)

+ 91
- 0
codes/pdf_to_pics.py View File

@ -0,0 +1,91 @@
import fitz
import time
import re
import os
from sim import compute_similarity
from flask import redirect,url_for,Blueprint
from PIL import Image
import math
import operator
from functools import reduce
# Blueprint mounted under the "/2pics" URL prefix.
bp_2pics = Blueprint("2pics", __name__,url_prefix="/2pics")
# NOTE: this blueprint does not seem to be of much use (?)
class to_pics:
    """Extract the images embedded in an uploaded PDF into a folder."""

    def pdf2pic(self, path, pic_path):
        """Save every image object of the PDF at *path* as a PNG.

        Images are written to *pic_path* as ``img1.png``, ``img2.png``, ...
        in the order their objects appear in the PDF.

        Args:
            path: Path of the PDF file to read.
            pic_path: Existing directory that receives the PNG files.

        Returns:
            The number of extracted images, as a string.
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # replacement for measuring elapsed time.
        t0 = time.perf_counter()
        # An object is an image when its definition carries both markers.
        checkXO = re.compile(r"/Type(?= */XObject)")
        checkIM = re.compile(r"/Subtype(?= */Image)")
        imgcount = 0
        doc = fitz.open(path)
        try:
            lenXREF = doc._getXrefLength()
            print(lenXREF)
            # Print basic information about the PDF.
            print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
            # Walk over every object of the document.
            for i in range(1, lenXREF):
                text = doc._getXrefString(i)
                if not (checkXO.search(text) and checkIM.search(text)):
                    continue
                imgcount += 1
                pix = fitz.Pixmap(doc, i)
                target = os.path.join(pic_path, "img{}.png".format(imgcount))
                if pix.n < 5:
                    # Fewer than 5 components: can be saved as PNG directly.
                    pix.writePNG(target)
                else:
                    # Otherwise convert (e.g. from CMYK) to RGB first.
                    pix0 = fitz.Pixmap(fitz.csRGB, pix)
                    pix0.writePNG(target)
                    pix0 = None
                # Release the pixmap.
                pix = None
        finally:
            # Always release the document, even when extraction fails.
            doc.close()
        t1 = time.perf_counter()
        print("运行时间:{}s".format(t1 - t0))
        print("提取了{}张图片".format(imgcount))
        return str(imgcount)

    def call_pdf2pic(self, file):
        """Extract the images of ``./upload/<file>`` and continue to call_sim.

        The images go into a new folder named after the upload without its
        extension. The extension is removed with os.path.splitext so that
        extensions of any length (``.pdf``, ``.docx``, ...) work.

        Args:
            file: File name of the upload inside ``./upload/``.

        Returns:
            The string ``"already exist"`` when the image folder is already
            present; otherwise a redirect to the ``call_sim`` view with the
            extension-less file name as ``path``.
        """
        # Path of the uploaded PDF.
        path = './upload/' + file
        stem = os.path.splitext(file)[0]
        pic_path = './upload/' + stem
        # The image folder must not exist yet.
        if os.path.exists(pic_path):
            print("文件夹已存在,请重新创建新文件夹!")
            return "already exist"
        os.mkdir(pic_path)
        print(pic_path)
        self.pdf2pic(path, pic_path)
        return redirect(url_for('call_sim', path=stem))
# return redirect(url_for('call_sim', m=m, pic_path=pic_path))
# @bp_2pics.route('/tt/<m>/<pic_path>')
# def call_sim(m, pic_path):
# message = app.similarity(m, pic_path)
# print("call_sim")
# return message

Loading…
Cancel
Save