From 71e4e2a41cf4c169910a8ac3f2975dfc0d4b9133 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=AD=8F=E5=A6=82=E8=93=9D?= <10172100262@stu.ecnu.edu.cn>
Date: Wed, 13 Jan 2021 16:30:14 +0800
Subject: [PATCH] .

---
 codes/app.py         |  95 +++++++++++++++++++++++++++++
 codes/feature.py     | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++
 codes/pdf_to_pics.py |  91 +++++++++++++++++++++++++++
 3 files changed, 355 insertions(+)
 create mode 100644 codes/app.py
 create mode 100644 codes/feature.py
 create mode 100644 codes/pdf_to_pics.py
diff --git a/codes/app.py b/codes/app.py
new file mode 100644
index 0000000..26cf508
--- /dev/null
+++ b/codes/app.py
@@ -0,0 +1,95 @@
+# from flask import Flask
+import pdf_to_pics,feature
+import os
+import uuid
+import platform
+from flask import Flask,request,redirect,url_for,Blueprint
+from werkzeug.utils import secure_filename
+from sim import compute_similarity
+
+
+# Path separator used when building paths by hand.  os.sep is '\\' on
+# Windows and '/' on POSIX systems, so no platform check is needed.
+slash = os.sep
+
+UPLOAD_FOLDER = 'upload'
+# Extensions (lower-case, without the dot) accepted by allowed_file().
+ALLOW_EXTENSIONS = {'html', 'htm', 'doc', 'docx', 'mht', 'pdf'}
+
+# Flask application object; the routes below are registered on it.
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+# Create the upload folder at import time if it does not exist yet.
+# UPLOAD_FOLDER is a relative path, so it is resolved against the
+# working directory of the process.
+if not os.path.exists(UPLOAD_FOLDER):
+    os.makedirs(UPLOAD_FOLDER)
+
+def allowed_file(filename):
+    """Return True if ``filename`` has an extension listed in ALLOW_EXTENSIONS (case-insensitive)."""
+    _stem, dot, ext = filename.rpartition('.')
+    return bool(dot) and ext.lower() in ALLOW_EXTENSIONS
+
+# Upload form and upload handler.
+@app.route('/',methods=['GET','POST'])
+def upload_file():
+    """Serve the upload form; on POST save the file under a uuid4 name and redirect to file2pic."""
+    if request.method =='POST':
+        # The form field is called "file".
+        file = request.files['file']
+        if file and allowed_file(file.filename):
+            # Take the extension from the raw name: secure_filename() drops
+            # non-ASCII characters, so a non-ASCII stem plus ".pdf" collapses
+            # to "pdf" and cannot be split.  allowed_file() already vetted it.
+            extension = file.filename.rsplit('.', 1)[1].lower()
+            # Random name so uploads with the same client name never collide.
+            file_name = str(uuid.uuid4()) + '.' + extension
+            saved_path = os.path.join(app.config['UPLOAD_FOLDER'], file_name)
+            file.save(saved_path)
+            print(os.path.abspath(saved_path))
+            return redirect(url_for('file2pic',file = file_name))
+    return '''
+    <!doctype html>
+    <title>Upload new File</title>
+    <h1>Select PDF to Upload</h1>
+    <form action="" method=post enctype=multipart/form-data>
+      <p><input type=file name=file>
+         <input type=submit value=Upload>
+    </form>
+    '''
+
+# Route: split an uploaded PDF into images via pdf_to_pics.to_pics.
+@app.route('/tt/<file>')
+def file2pic(file):
+    """Run ``to_pics.call_pdf2pic`` on ``file`` and return its result as the response."""
+    response = pdf_to_pics.to_pics().call_pdf2pic(file)
+    print('file2pic')
+    return response
+
+
+# Route: compute similarity through sim/compute_similarity.py.
+@app.route('/ttt/<path>')
+def call_sim(path):
+    """Run ``compute_similarity.simi().similarity`` on ``path`` and return its result."""
+    response = compute_similarity.simi().similarity(path)
+    print("call_sim")
+    return response
+
+
+
+# @app.route('/tttt/<alist>')
+# def call_feature(alist):
+#     print("call_feature")
+#     the_feature = feature.feat()
+#     message = the_feature.call_feature_extraction(alist)
+#     # print("call_feature")
+#     return message
+
+
+if __name__ == "__main__":
+    app.register_blueprint(pdf_to_pics.bp_2pics)
+    # Script entry point: start the server with app.run()'s default settings.
+    # app.run(host='0.0.0.0', port=7000)  # IP Port
+    app.run()
+
diff --git a/codes/feature.py b/codes/feature.py
new file mode 100644
index 0000000..d5b1c49
--- /dev/null
+++ b/codes/feature.py
@@ -0,0 +1,169 @@
+from __future__ import print_function
+import cv2
+import numpy as np
+import os
+from flask import render_template
+import zipfile
+
+MAX_MATCHES = 500  # feature count passed to cv2.ORB_create
+GOOD_MATCH_PERCENT = 0.15  # share of the lowest-distance matches that is kept
+
+
+
+class feat:
+    """ORB feature matching between image pairs, plus zip packaging of the results."""
+
+    # Where zipDir writes archives, and the URL prefix it reports them under.
+    ZIP_ROOT = "/home/wwwroot/default/"
+    ZIP_URL_BASE = "http://106.75.226.23/"
+    # A homography needs at least four point correspondences.
+    MIN_HOMOGRAPHY_POINTS = 4
+
+    def call_feature_extraction_1(self, folder_path, list, index, tmp):
+        """Draw feature matches for the top-ranked image pairs and zip them.
+
+        :param folder_path: the text after its first 7 characters names the
+            folder created under ``output/``
+        :param list: rows; items 0/1 are the reference/candidate image paths,
+            items 2 and 5 are the ranking keys
+        :param index: the first ``int(index / 10)`` ranked rows are processed
+        :param tmp: not used
+        :return: the archive URL returned by ``zipDir``
+        """
+        # Rank by (row[2], row[5]), highest first.
+        ranked = sorted(list, key=(lambda x: [x[2], x[5]]), reverse=True)
+        print(ranked)
+        print('---------------')
+        print(index)
+        # Set before the loop: zipDir needs it even if no pair is processed.
+        outFilename = "output/" + folder_path[7:]
+
+        # Slicing, unlike indexing, tolerates fewer rows than requested.
+        for row in ranked[:max(int(index / 10), 0)]:
+            refFilename, imFilename = row[0], row[1]
+            print("Reading reference image : ", refFilename)
+            imReference = cv2.imread(refFilename, cv2.IMREAD_COLOR)
+            print("Reading image to align : ", imFilename)
+            im = cv2.imread(imFilename, cv2.IMREAD_COLOR)
+            if imReference is None or im is None:
+                # imread returns None when a file cannot be read.
+                print("Skipping unreadable image pair")
+                continue
+
+            print(refFilename)
+            print(imFilename)
+            print(folder_path[7:])
+            # name1 keeps the leading '/' and drops the last 4 characters.
+            name1 = refFilename[refFilename.rfind('/'):-4]
+            print(name1)
+            name2 = imFilename[imFilename.rfind('/') + 1:]
+            print(name2)
+            print("Saving aligned image : ", outFilename)
+            print("Aligning images ...")
+
+            imReg, h, img5 = self.alignImages(im, imReference)
+            if isinstance(img5, str):
+                # alignImages reports failure with the string 'white'.
+                print('white')
+                continue
+            print(outFilename)
+            # cv2.imwrite needs the target folder to exist.
+            if not os.path.exists(outFilename):
+                os.makedirs(outFilename)
+            cv2.imwrite(outFilename + name1 + name2, img5)
+
+        return feat.zipDir(outFilename)
+
+    def alignImages(self, im1, im2):
+        """Match ORB features of ``im1`` against ``im2`` and warp ``im1`` onto ``im2``.
+
+        Side effect: the top matches are written to ``matches.jpg``.
+
+        :param im1: image to align (3-channel BGR)
+        :param im2: reference image (3-channel BGR)
+        :return: ``(warped im1, homography, match visualisation)``, or three
+            ``'white'`` strings when no homography can be estimated
+        """
+        failure = ('white', 'white', 'white')
+        # Convert images to grayscale
+        im1Gray = cv2.cvtColor(im1, cv2.COLOR_BGR2GRAY)
+        im2Gray = cv2.cvtColor(im2, cv2.COLOR_BGR2GRAY)
+
+        # Detect ORB features and compute descriptors.
+        orb = cv2.ORB_create(MAX_MATCHES)
+        keypoints1, descriptors1 = orb.detectAndCompute(im1Gray, None)
+        keypoints2, descriptors2 = orb.detectAndCompute(im2Gray, None)
+        if not keypoints1 or not keypoints2:
+            return failure
+
+        # Match features.
+        matcher = cv2.DescriptorMatcher_create(cv2.DESCRIPTOR_MATCHER_BRUTEFORCE_HAMMING)
+        matches = matcher.match(descriptors1, descriptors2, None)
+        # Keep the best (lowest-distance) share.  sorted() also works when
+        # match() returns a tuple, which has no .sort().
+        matches = sorted(matches, key=lambda x: x.distance)
+        matches = matches[:int(len(matches) * GOOD_MATCH_PERCENT)]
+
+        # Draw top matches
+        imMatches = cv2.drawMatches(im1, keypoints1, im2, keypoints2, matches, None)
+        cv2.imwrite("matches.jpg", imMatches)
+        if len(matches) < self.MIN_HOMOGRAPHY_POINTS:
+            return failure
+
+        # Extract location of good matches
+        points1 = np.float32([keypoints1[m.queryIdx].pt for m in matches])
+        points2 = np.float32([keypoints2[m.trainIdx].pt for m in matches])
+        # h is None when no homography could be estimated.
+        h, _mask = cv2.findHomography(points1, points2, cv2.RANSAC)
+        if h is None:
+            return failure
+
+        # Use homography
+        height, width, channels = im2.shape
+        im1Reg = cv2.warpPerspective(im1, h, (width, height))
+        img1 = cv2.drawKeypoints(im1, keypoints1, im1, color=(255, 0, 255))
+        img2 = cv2.drawKeypoints(im2, keypoints2, im2, color=(255, 0, 255))
+        img5 = cv2.drawMatches(img1, keypoints1, img2, keypoints2, matches, None, flags=2)
+        return im1Reg, h, img5
+
+    def return_img_stream(self, img_local_path):
+        """Read and base64-encode the image file, then return its path.
+
+        :param img_local_path: local path of a single image file
+        :return: ``img_local_path``, unchanged
+        """
+        import base64
+        # Binary mode: image bytes are not text, and b64encode needs bytes.
+        with open(img_local_path, 'rb') as img_f:
+            img_stream = base64.b64encode(img_f.read())
+        # NOTE(review): the encoded stream is discarded and the path returned;
+        # kept for existing callers - confirm which one is intended.
+        return img_local_path
+
+    @staticmethod
+    def zipDir(dirpath):
+        """Zip the folder ``dirpath`` under ZIP_ROOT and return the archive URL.
+
+        :param dirpath: folder to compress; its first 21 characters are
+            dropped to form the archive name (NOTE(review): presumably a
+            fixed path prefix - confirm against callers)
+        :return: ``ZIP_URL_BASE + dirpath[21:] + ".zip"``
+        """
+        archive_name = dirpath[21:] + ".zip"
+        # The with-block closes the archive even if a write fails.
+        with zipfile.ZipFile(feat.ZIP_ROOT + archive_name, "w", zipfile.ZIP_DEFLATED) as archive:
+            for path, dirnames, filenames in os.walk(dirpath):
+                # Store entries relative to dirpath rather than by full path.
+                fpath = path.replace(dirpath, '')
+                for filename in filenames:
+                    archive.write(os.path.join(path, filename), os.path.join(fpath, filename))
+        return feat.ZIP_URL_BASE + archive_name
+
+
+
+# im1 = '/Users/wrl/Desktop/test0506/pic/2303.png'
+# im2 = '/Users/wrl/Desktop/test0506/pic/2304.png'
+# img1 = cv2.imread(im1, cv2.IMREAD_COLOR)
+# img2 = cv2.imread(im2, cv2.IMREAD_COLOR)
+# imReg, h, img5 =feat().alignImages(img1,img2)
+
diff --git a/codes/pdf_to_pics.py b/codes/pdf_to_pics.py
new file mode 100644
index 0000000..6406446
--- /dev/null
+++ b/codes/pdf_to_pics.py
@@ -0,0 +1,91 @@
+import fitz
+import time
+import re
+import os
+from sim import compute_similarity
+from flask import redirect,url_for,Blueprint
+from PIL import Image
+import math
+import operator
+from functools import reduce
+
+bp_2pics = Blueprint("2pics", __name__,url_prefix="/2pics")
+# NOTE(review): this blueprint does not seem to be used for anything - confirm.
+
+class to_pics:
+    """Extract the embedded images of an uploaded PDF into a folder of PNG files."""
+
+    @staticmethod
+    def _pick(obj, *names):
+        """Return the first attribute of ``obj`` named in ``names`` (PyMuPDF renamed several methods)."""
+        for name in names:
+            if hasattr(obj, name):
+                return getattr(obj, name)
+        raise AttributeError(names[0])
+
+    def pdf2pic(self, path, pic_path):
+        """Save every image XObject of the PDF at ``path`` as ``img<N>.png``.
+
+        :param path: path of the PDF file
+        :param pic_path: folder that receives the PNG files
+        :return: number of extracted images, as a string
+        """
+        # time.clock() was removed in Python 3.8; prefer perf_counter.
+        timer = getattr(time, "perf_counter", None) or time.clock
+        t0 = timer()
+        # Regular expressions that recognise an image XObject definition.
+        checkXO = r"/Type(?= */XObject)"
+        checkIM = r"/Subtype(?= */Image)"
+        doc = fitz.open(path)
+        try:
+            imgcount = 0
+            lenXREF = self._pick(doc, "xref_length", "_getXrefLength")()
+            xref_text = self._pick(doc, "xref_object", "_getXrefString")
+            print(lenXREF)
+            print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
+            for i in range(1, lenXREF):
+                text = xref_text(i)
+                # Skip objects that are not both an XObject and an image.
+                if not re.search(checkXO, text) or not re.search(checkIM, text):
+                    continue
+                imgcount += 1
+                pix = fitz.Pixmap(doc, i)
+                new_name = "img{}.png".format(imgcount)
+                target = os.path.join(pic_path, new_name)
+                # Pixmaps with n >= 5 (CMYK) are converted to RGB before PNG output.
+                if pix.n >= 5:
+                    pix = fitz.Pixmap(fitz.csRGB, pix)
+                self._pick(pix, "save", "writePNG")(target)
+                pix = None  # release the pixmap
+        finally:
+            doc.close()
+        print("运行时间:{}s".format(timer() - t0))
+        print("提取了{}张图片".format(imgcount))
+        return str(imgcount)
+
+    def call_pdf2pic(self, file):
+        """Extract the images of ``./upload/<file>`` and redirect to ``call_sim``.
+
+        :param file: name of the uploaded file inside ``./upload/``
+        :return: ``"already exist"`` if the picture folder is already there,
+            otherwise a redirect to ``call_sim`` for the extension-less name
+        """
+        path = './upload/' + file
+        # splitext handles extensions of any length ('.pdf', '.docx', ...).
+        pic_path = os.path.splitext(path)[0]
+        stem = os.path.splitext(file)[0]
+        if os.path.exists(pic_path):
+            print("文件夹已存在，请重新创建新文件夹！")
+            return "already exist"
+        os.mkdir(pic_path)
+        print(pic_path)
+        self.pdf2pic(path, pic_path)
+        return redirect(url_for('call_sim',path = stem))
+
+
+# @bp_2pics.route('/tt/<m>/<pic_path>')
+# def call_sim(m, pic_path):
+#     message = app.similarity(m, pic_path)
+#     print("call_sim")
+#     return message
+