From 71e4e2a41cf4c169910a8ac3f2975dfc0d4b9133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AD=8F=E5=A6=82=E8=93=9D?= <10172100262@stu.ecnu.edu.cn> Date: Wed, 13 Jan 2021 16:30:14 +0800 Subject: [PATCH] . --- codes/app.py | 95 +++++++++++++++++++++++++++++ codes/feature.py | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++ codes/pdf_to_pics.py | 91 +++++++++++++++++++++++++++ 3 files changed, 355 insertions(+) create mode 100644 codes/app.py create mode 100644 codes/feature.py create mode 100644 codes/pdf_to_pics.py diff --git a/codes/app.py b/codes/app.py new file mode 100644 index 0000000..26cf508 --- /dev/null +++ b/codes/app.py @@ -0,0 +1,95 @@ +# from flask import Flask +import pdf_to_pics,feature +import os +import uuid +import platform +from flask import Flask,request,redirect,url_for,Blueprint +from werkzeug.utils import secure_filename +from sim import compute_similarity + + +if platform.system() == "Windows": + slash = '\\' +else: + platform.system()=="Linux" + slash = '/' + +UPLOAD_FOLDER = 'upload' +ALLOW_EXTENSIONS = set(['html', 'htm', 'doc', 'docx', 'mht', 'pdf']) + +app = Flask(__name__) +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER + +#判断文件夹是否存在,如果不存在则创建 +if not os.path.exists(UPLOAD_FOLDER): + os.makedirs(UPLOAD_FOLDER) +else: + pass + +# 判断文件后缀是否在列表中 +def allowed_file(filename): + return '.' in filename and \ + filename.rsplit('.', 1)[1] in ALLOW_EXTENSIONS + +# 上传pdf并调用file2pic函数 +@app.route('/',methods=['GET','POST']) +def upload_file(): + if request.method =='POST': + #获取post过来的文件名称,从name=file参数中获取 + file = request.files['file'] + if file and allowed_file(file.filename): + # secure_filename方法会去掉文件名中的中文 + filename = secure_filename(file.filename) + # 因为上次的文件可能有重名,因此使用uuid保存文件 + file_name = str(uuid.uuid4()) + '.' 
+ filename.rsplit('.', 1)[1] + file.save(os.path.join(app.config['UPLOAD_FOLDER'],file_name)) + base_path = os.getcwd() + file_path = base_path + slash + app.config['UPLOAD_FOLDER'] + slash + file_name + print(file_path) + return redirect(url_for('file2pic',file = file_name)) + # return redirect(url_for('test')) + # return redirect("http://e127.0.0.1:5000/") + return ''' + + Upload new File +

Select PDF to Upload

+
+

+ +

+ ''' + +# 调用pdf_to_pics.py中的类与函数,拆分图片 +@app.route('/tt/') +def file2pic(file): + pics = pdf_to_pics.to_pics() + message = pics.call_pdf2pic(file) + print('file2pic') + return message + + +# 调用sim/compute_similarity.py中的函数计算similarity +@app.route('/ttt/') +def call_sim(path): + the_similarity = compute_similarity.simi() + message = the_similarity.similarity(path) + print("call_sim") + return message + + + +# @app.route('/tttt/') +# def call_feature(alist): +# print("call_feature") +# the_feature = feature.feat() +# message = the_feature.call_feature_extraction(alist) +# # print("call_feature") +# return message + + +if __name__ == "__main__": + app.register_blueprint(pdf_to_pics.bp_2pics) + + # app.run(host='0.0.0.0', port=7000) # IP Port + app.run() + diff --git a/codes/feature.py b/codes/feature.py new file mode 100644 index 0000000..d5b1c49 --- /dev/null +++ b/codes/feature.py @@ -0,0 +1,169 @@ +from __future__ import print_function +import cv2 +import numpy as np +import os +from flask import render_template +import zipfile + +MAX_MATCHES = 500 +GOOD_MATCH_PERCENT = 0.15 + + + +class feat: + def call_feature_extraction_1(self, folder_path, list, index, tmp): + # 按相似度排序 + list = sorted(list, key=(lambda x: [x[2], x[5]]),reverse=True) + print(list) + print('---------------') + print(index) + + for i in range(int(index/10)): + # 取第一组比较并返回 + refFilename = list[i][0] + # imgname1 = '/home/Jupyterlab/wrl/pic/xiagao/pic/11.jpeg' + print("Reading reference image : ", refFilename) + imReference = cv2.imread(refFilename, cv2.IMREAD_COLOR) + + imFilename = list[i][1] + # imgname2 = '/home/Jupyterlab/wrl/pic/xiagao/pic/12.jpeg' + print("Reading image to align : ", imFilename); + im = cv2.imread(imFilename, cv2.IMREAD_COLOR) + + print(refFilename) + print(imFilename) + print(folder_path[7:]) + p1 = refFilename.rfind('/') + name1 = refFilename[p1:-4] + print(name1) + p2 = imFilename.rfind('/') + name2 = imFilename[p2 + 1:] + print(name2) + + # Write aligned image to disk. 
+ outFilename = "output/" + folder_path[7:] + pre = os.getcwd() + print("Saving aligned image : ", outFilename) + + print("Aligning images ...") + # Registered image will be resotred in imReg. + # The estimated homography will be stored in h. + imReg, h, img5 = feat().alignImages(im, imReference) + if (str(img5) == 'white'): + print('white') + # continue + else: + print(outFilename) + # imgwrite需要建好路径!!!!!! + if not os.path.exists(outFilename): + os.makedirs(outFilename) + outFilename1 = outFilename + name1 + name2 + cv2.imwrite(outFilename1, img5) + # continue + + outFullName = feat.zipDir(outFilename) + pre1 = "/home/wwwroot/default/" + outFilename + ".zip" + # return str(pre) + '/' + outFilename + # return pre1 + return outFullName + + + + + def alignImages(self, im1, im2): + # Convert images to grayscale + im1Gray = cv2.cvtColor(im1, cv2.COLOR_BGR2GRAY) + im2Gray = cv2.cvtColor(im2, cv2.COLOR_BGR2GRAY) + + # Detect ORB features and compute descriptors. + orb = cv2.ORB_create(MAX_MATCHES) + keypoints1, descriptors1 = orb.detectAndCompute(im1Gray, None) + keypoints2, descriptors2 = orb.detectAndCompute(im2Gray, None) + + if keypoints1 and keypoints2: + # Match features. 
+ matcher = cv2.DescriptorMatcher_create(cv2.DESCRIPTOR_MATCHER_BRUTEFORCE_HAMMING) + matches = matcher.match(descriptors1, descriptors2, None) + + # Sort matches by score + matches.sort(key=lambda x: x.distance, reverse=False) + + # Remove not so good matches + numGoodMatches = int(len(matches) * GOOD_MATCH_PERCENT) + matches = matches[:numGoodMatches] + + # Draw top matches + imMatches = cv2.drawMatches(im1, keypoints1, im2, keypoints2, matches, None) + cv2.imwrite("matches.jpg", imMatches) + + # Extract location of good matches + points1 = np.zeros((len(matches), 2), dtype=np.float32) + points2 = np.zeros((len(matches), 2), dtype=np.float32) + + for i, match in enumerate(matches): + points1[i, :] = keypoints1[match.queryIdx].pt + points2[i, :] = keypoints2[match.trainIdx].pt + + if (points1.size == 0) or (points2.size == 0): + return 'white', 'white', 'white' + + # Find homography + h, mask = cv2.findHomography(points1, points2, cv2.RANSAC) + + # Use homography + height, width, channels = im2.shape + im1Reg = cv2.warpPerspective(im1, h, (width, height)) + img1 = cv2.drawKeypoints(im1, keypoints1, im1, color=(255, 0, 255)) + img2 = cv2.drawKeypoints(im2, keypoints2, im2, color=(255, 0, 255)) + img5 = cv2.drawMatches(img1, keypoints1, img2, keypoints2, matches, None, flags=2) + + return im1Reg, h, img5 + + else: + return 'white', 'white', 'white' + + def return_img_stream(self, img_local_path): + # """ + # 工具函数: + # 获取本地图片流 + # :param img_local_path:文件单张图片的本地绝对路径 + # :return: 图片流 + # """ + import base64 + img_stream = '' + with open(img_local_path, 'r') as img_f: + img_stream = img_f.read() + img_stream = base64.b64encode(img_stream) + return img_local_path + + + + def zipDir(dirpath): + """ + 压缩指定文件夹 + :param dirpath: 目标文件夹路径 + :param outFullName: 压缩文件保存路径+xxxx.zip + :return: 无 + """ + outFullName = "/home/wwwroot/default/" + dirpath[21:] + ".zip" + outFullName1 = "http://106.75.226.23/" + dirpath[21:] + ".zip" + # if not os.path.exists(outFullName): + # 
os.makedirs(outFullName) + zip = zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED) + for path, dirnames, filenames in os.walk(dirpath): + # 去掉目标跟路径,只对目标文件夹下边的文件及文件夹进行压缩 + fpath = path.replace(dirpath, '') + + for filename in filenames: + zip.write(os.path.join(path, filename), os.path.join(fpath, filename)) + zip.close() + return outFullName1 + + + +# im1 = '/Users/wrl/Desktop/test0506/pic/2303.png' +# im2 = '/Users/wrl/Desktop/test0506/pic/2304.png' +# img1 = cv2.imread(im1, cv2.IMREAD_COLOR) +# img2 = cv2.imread(im2, cv2.IMREAD_COLOR) +# imReg, h, img5 =feat().alignImages(img1,img2) + diff --git a/codes/pdf_to_pics.py b/codes/pdf_to_pics.py new file mode 100644 index 0000000..6406446 --- /dev/null +++ b/codes/pdf_to_pics.py @@ -0,0 +1,91 @@ +import fitz +import time +import re +import os +from sim import compute_similarity +from flask import redirect,url_for,Blueprint +from PIL import Image +import math +import operator +from functools import reduce + +bp_2pics = Blueprint("2pics", __name__,url_prefix="/2pics") +# 这个bp似乎没啥用,,? 
class to_pics:
    """Extract the images embedded in an uploaded PDF into PNG files."""

    @staticmethod
    def _pick(obj, *names):
        """Return the first attribute of *obj* that exists among *names*.

        PyMuPDF renamed several methods between releases; this lets the
        code run against both the current and the legacy spelling.
        """
        for name in names:
            member = getattr(obj, name, None)
            if member is not None:
                return member
        raise AttributeError(
            "{!r} has none of the attributes {}".format(obj, names))

    def pdf2pic(self, path, pic_path):
        """Write every image object of the PDF at *path* into *pic_path*.

        Images are saved as ``img1.png``, ``img2.png``, ... in the order of
        their xref numbers.

        :param path: path of the PDF file
        :param pic_path: existing folder that receives the PNG files
        :return: the number of extracted images, as a string
        """
        # time.clock() was removed in Python 3.8.
        t0 = time.perf_counter()
        # An image is an object that is both /Type /XObject and /Subtype /Image.
        checkXO = r"/Type(?= */XObject)"
        checkIM = r"/Subtype(?= */Image)"

        doc = fitz.open(path)
        try:
            imgcount = 0
            lenXREF = self._pick(doc, 'xref_length', '_getXrefLength')()
            xref_source = self._pick(doc, 'xref_object', '_getXrefString')
            print(lenXREF)

            # Print basic information about the PDF.
            print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

            # xref 0 is reserved, so start at 1.
            for i in range(1, lenXREF):
                text = xref_source(i)
                if not re.search(checkXO, text) or not re.search(checkIM, text):
                    continue
                imgcount += 1
                pix = fitz.Pixmap(doc, i)
                target = os.path.join(pic_path, "img{}.png".format(imgcount))

                # Pixmaps with 5 or more components (CMYK) are converted to
                # RGB first; the others are written as they are.
                if pix.n >= 5:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                self._pick(pix, 'save', 'writePNG')(target)
                # Release the pixmap memory right away.
                pix = None
        finally:
            doc.close()

        t1 = time.perf_counter()
        print("运行时间:{}s".format(t1 - t0))
        print("提取了{}张图片".format(imgcount))
        return str(imgcount)

    def call_pdf2pic(self, file):
        """Extract the images of ``./upload/<file>`` and redirect to ``call_sim``.

        The pictures go into a folder named like the file without its
        extension.  If that folder already exists nothing is extracted.

        :param file: name of the uploaded file inside ``./upload/``
        :return: ``"already exist"`` when the target folder exists, otherwise
            a redirect response to the ``call_sim`` endpoint
        """
        path = './upload/' + file
        # splitext copes with extensions of any length (".pdf", ".html",
        # ".docx"), unlike slicing off the last four characters.
        pic_path = os.path.splitext(path)[0]
        if os.path.exists(pic_path):
            print("文件夹已存在,请重新创建新文件夹!")
            return "already exist"

        os.mkdir(pic_path)
        print(pic_path)
        self.pdf2pic(path, pic_path)
        return redirect(url_for('call_sim', path=os.path.splitext(file)[0]))
        # return redirect(url_for('call_sim', m=m, pic_path=pic_path))


# Disabled in the original patch (its URL placeholders were lost in extraction):
# @bp_2pics.route('/tt//')
# def call_sim(m, pic_path):
#     message = app.similarity(m, pic_path)
#     print("call_sim")
#     return message