.

3 vuotta sitten · 71e4e2a41c
--- a/codes/app.py
+++ b/codes/app.py
@ -0,0 +1,95 @@
 # from flask import Flask
 import pdf_to_pics,feature
 import os
 import uuid
 import platform
 from flask import Flask,request,redirect,url_for,Blueprint
 from werkzeug.utils import secure_filename
 from sim import compute_similarity
 # Path separator used when assembling the absolute upload path.
 # Windows gets a backslash, every other platform a forward slash. The old
 # else-branch also contained a bare `platform.system()=="Linux"` comparison
 # whose result was discarded; it had no effect and has been dropped.
 slash = '\\' if platform.system() == "Windows" else '/'
 # Directory (relative to the working directory) that receives uploads.
 UPLOAD_FOLDER = 'upload'
 # Extensions that allowed_file() accepts.
 ALLOW_EXTENSIONS = {'html', 'htm', 'doc', 'docx', 'mht', 'pdf'}
 app = Flask(__name__)
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 # Create the upload directory if it is missing; otherwise leave it alone.
 if not os.path.exists(UPLOAD_FOLDER):
    os.makedirs(UPLOAD_FOLDER)
 # Check whether the file extension is in the allowed list
 def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOW_EXTENSIONS.

    The extension is the text after the last dot. It is compared
    case-insensitively, so ``REPORT.PDF`` is accepted like ``report.pdf``.
    Names without any dot are rejected.
    """
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOW_EXTENSIONS
 # Upload a PDF, then redirect to the file2pic view
@app.route('/',methods=['GET','POST'])
def upload_file():
    """Serve the upload form and store a posted document.

    On POST the file from the multipart field ``file`` is saved in the
    configured upload folder under a fresh UUID-based name, and the client is
    redirected to the ``file2pic`` endpoint with that name. On GET, or when
    the posted file is missing or has a disallowed extension, the HTML upload
    form is returned.
    """
    if request.method == 'POST':
        # The uploaded file arrives in the form field named "file".
        file = request.files['file']
        if file and allowed_file(file.filename):
            # Take the extension from the name allowed_file() just validated.
            # secure_filename() can strip a non-ASCII stem together with its
            # dot (e.g. leaving only "pdf"), after which splitting on '.'
            # would raise IndexError. Only the extension is reused, and it is
            # one of the whitelisted values, so it is safe on disk.
            extension = file.filename.rsplit('.', 1)[1].lower()
            # Uploads may share a name, so the stored name is a UUID.
            file_name = str(uuid.uuid4()) + '.' + extension
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], file_name))
            file_path = os.path.join(os.getcwd(), app.config['UPLOAD_FOLDER'], file_name)
            print(file_path)
            return redirect(url_for('file2pic', file=file_name))
    return '''
    <!doctype html>
    <title>Upload new File</title>
    <h1>Select PDF to Upload</h1>
    <form action="" method=post enctype=multipart/form-data>
      <p><input type=file name=file>
         <input type=submit value=Upload>
    </form>
    '''
 # Call the class and functions in pdf_to_pics.py to split the PDF into images
@app.route('/tt/<file>')
def file2pic(file):
    """Run the PDF-to-images step for the uploaded file named *file*.

    Delegates to ``pdf_to_pics.to_pics.call_pdf2pic`` and returns its result
    as the response.
    """
    response = pdf_to_pics.to_pics().call_pdf2pic(file)
    print('file2pic')
    return response
 # Call the function in sim/compute_similarity.py to compute the similarity
@app.route('/ttt/<path>')
def call_sim(path):
    """Run the similarity step for *path*.

    Delegates to ``compute_similarity.simi.similarity`` and returns its
    result as the response.
    """
    response = compute_similarity.simi().similarity(path)
    print("call_sim")
    return response
 # @app.route('/tttt/<alist>')
 # def call_feature(alist):
 #     print("call_feature")
 #     the_feature = feature.feat()
 #     message = the_feature.call_feature_extraction(alist)
 #     # print("call_feature")
 #     return message
 # Start the Flask application only when this file is executed as a script.
 if __name__ == "__main__":
    # Attach the blueprint defined in pdf_to_pics before serving requests.
    app.register_blueprint(pdf_to_pics.bp_2pics)
    # Alternative start-up that binds an explicit host and port:
    # app.run(host='0.0.0.0', port=7000)  # IP Port
    app.run()
--- a/codes/feature.py
+++ b/codes/feature.py
@ -0,0 +1,169 @@
 from __future__ import print_function
 import cv2
 import numpy as np
 import os
 from flask import render_template
 import zipfile
 # Value handed to cv2.ORB_create() in feat.alignImages.
 MAX_MATCHES = 500
 # Fraction of the distance-sorted matches that feat.alignImages keeps.
 GOOD_MATCH_PERCENT = 0.15
 class feat:
    def call_feature_extraction_1(self, folder_path, list, index, tmp):
        """Render feature matches for the top-ranked image pairs and zip them.

        Args:
            folder_path: Path string; everything after its first 7 characters
                names the directory created under ``output/``.
            list: Sequence of records. Element 0 is the reference image path,
                element 1 the path of the image to align, and elements 2 and
                5 are the sort keys (presumably similarity scores -- confirm
                against the caller).
            index: Number from which the amount of pairs to process is
                derived as ``int(index / 10)``.
            tmp: Unused.

        Returns:
            Whatever ``feat.zipDir`` returns for the output directory.
        """
        # `list` shadows the builtin but belongs to the public signature, so
        # it is only read once here and a local name is used afterwards.
        pairs = sorted(list, key=lambda rec: [rec[2], rec[5]], reverse=True)
        print(pairs)
        print('---------------')
        print(index)

        # Loop-invariant. Computing it up front also keeps it defined when no
        # pair is processed (it used to be unbound in that case).
        outFilename = "output/" + folder_path[7:]
        print("Saving aligned image : ", outFilename)

        # Never index past the end of the ranking, and never go negative.
        pair_count = max(0, min(int(index / 10), len(pairs)))
        for rec in pairs[:pair_count]:
            refFilename = rec[0]
            imFilename = rec[1]
            print("Reading reference image : ", refFilename)
            imReference = cv2.imread(refFilename, cv2.IMREAD_COLOR)
            print("Reading image to align : ", imFilename)
            im = cv2.imread(imFilename, cv2.IMREAD_COLOR)
            if imReference is None or im is None:
                # cv2.imread reports an unreadable file by returning None.
                print("Skipping unreadable pair : ", refFilename, imFilename)
                continue

            # name1 keeps its leading '/' and loses the 4-character suffix;
            # name2 is the bare file name. Together they form the output name.
            name1 = refFilename[refFilename.rfind('/'):-4]
            name2 = imFilename[imFilename.rfind('/') + 1:]

            print("Aligning images ...")
            _, _, img5 = feat().alignImages(im, imReference)
            # alignImages returns the string 'white' when it could not align.
            # Check the type first so a real image array is never stringified.
            if isinstance(img5, str) and img5 == 'white':
                print('white')
                continue

            # cv2.imwrite does not create missing directories.
            os.makedirs(outFilename, exist_ok=True)
            cv2.imwrite(outFilename + name1 + name2, img5)

        return feat.zipDir(outFilename)
    def alignImages(self, im1, im2):
        """Align *im1* to *im2* using ORB features and a RANSAC homography.

        Args:
            im1: Image to align; converted with ``cv2.COLOR_BGR2GRAY``.
            im2: Reference image; its ``shape`` must unpack into
                ``(height, width, channels)``.

        Returns:
            ``(im1Reg, h, img5)``: *im1* warped into the frame of *im2*, the
            homography, and a side-by-side drawing of the kept matches. When
            alignment is not possible (no keypoints, fewer than four kept
            matches, or no homography found) all three elements are the
            string ``'white'``.
        """
        failed = ('white', 'white', 'white')

        # Convert images to grayscale.
        im1Gray = cv2.cvtColor(im1, cv2.COLOR_BGR2GRAY)
        im2Gray = cv2.cvtColor(im2, cv2.COLOR_BGR2GRAY)

        # Detect ORB features and compute descriptors.
        orb = cv2.ORB_create(MAX_MATCHES)
        keypoints1, descriptors1 = orb.detectAndCompute(im1Gray, None)
        keypoints2, descriptors2 = orb.detectAndCompute(im2Gray, None)
        if not keypoints1 or not keypoints2:
            return failed

        # Match features and order them best-first. sorted() is used instead
        # of .sort() because match() may return a tuple in some OpenCV builds.
        matcher = cv2.DescriptorMatcher_create(cv2.DESCRIPTOR_MATCHER_BRUTEFORCE_HAMMING)
        matches = sorted(matcher.match(descriptors1, descriptors2, None),
                         key=lambda m: m.distance)

        # Keep only the best fraction of the matches.
        matches = matches[:int(len(matches) * GOOD_MATCH_PERCENT)]

        # Debug artefact: the kept matches drawn on the untouched inputs.
        imMatches = cv2.drawMatches(im1, keypoints1, im2, keypoints2, matches, None)
        cv2.imwrite("matches.jpg", imMatches)

        # A homography needs at least four point correspondences.
        if len(matches) < 4:
            return failed

        # Extract the locations of the kept matches.
        points1 = np.float32([keypoints1[m.queryIdx].pt for m in matches])
        points2 = np.float32([keypoints2[m.trainIdx].pt for m in matches])

        h, _ = cv2.findHomography(points1, points2, cv2.RANSAC)
        if h is None:
            # The estimation can fail; there is nothing to warp with then.
            return failed

        height, width, channels = im2.shape
        im1Reg = cv2.warpPerspective(im1, h, (width, height))

        # Draw keypoints into fresh images (outImage=None) so the caller's
        # input arrays are not modified.
        img1 = cv2.drawKeypoints(im1, keypoints1, None, color=(255, 0, 255))
        img2 = cv2.drawKeypoints(im2, keypoints2, None, color=(255, 0, 255))
        img5 = cv2.drawMatches(img1, keypoints1, img2, keypoints2, matches, None, flags=2)
        return im1Reg, h, img5
    def return_img_stream(self, img_local_path):
        # """
        # 工具函数:
        # 获取本地图片流
        # :param img_local_path:文件单张图片的本地绝对路径
        # :return: 图片流
        # """
        import base64
        img_stream = ''
        with open(img_local_path, 'r') as img_f:
            img_stream = img_f.read()
            img_stream = base64.b64encode(img_stream)
        return img_local_path
    def zipDir(dirpath):
        """
        压缩指定文件夹
        :param dirpath: 目标文件夹路径
        :param outFullName: 压缩文件保存路径+xxxx.zip
        :return: 无
        """
        outFullName = "/home/wwwroot/default/" + dirpath[21:] + ".zip"
        outFullName1 = "http://106.75.226.23/"  + dirpath[21:] + ".zip"
        # if not os.path.exists(outFullName):
        #     os.makedirs(outFullName)
        zip = zipfile.ZipFile(outFullName, "w", zipfile.ZIP_DEFLATED)
        for path, dirnames, filenames in os.walk(dirpath):
            # 去掉目标跟路径，只对目标文件夹下边的文件及文件夹进行压缩
            fpath = path.replace(dirpath, '')
            for filename in filenames:
                zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
        zip.close()
        return outFullName1
 # im1 = '/Users/wrl/Desktop/test0506/pic/2303.png'
 # im2 = '/Users/wrl/Desktop/test0506/pic/2304.png'
 # img1 = cv2.imread(im1, cv2.IMREAD_COLOR)
 # img2 = cv2.imread(im2, cv2.IMREAD_COLOR)
 # imReg, h, img5 =feat().alignImages(img1,img2)
--- a/codes/pdf_to_pics.py
+++ b/codes/pdf_to_pics.py
@ -0,0 +1,91 @@
 import fitz
 import time
 import re
 import os
 from sim import compute_similarity
 from flask import redirect,url_for,Blueprint
 from PIL import Image
 import math
 import operator
 from functools import reduce
 bp_2pics = Blueprint("2pics", __name__,url_prefix="/2pics")
 # NOTE(review): this blueprint does not seem to serve any purpose -- confirm before relying on it.
 class to_pics:
    def pdf2pic(self, path, pic_path):
        """Extract the embedded images of a PDF into *pic_path* as PNG files.

        Args:
            path: Path of the PDF, opened with ``fitz.open``.
            pic_path: Directory that receives ``img1.png``, ``img2.png``, ...

        Returns:
            The number of extracted images, as a string.
        """
        # time.clock() was removed in Python 3.8; perf_counter() replaces it.
        t0 = time.perf_counter()
        # An object is an image when it is both /Type /XObject and
        # /Subtype /Image. Compile the patterns once, outside the loop.
        checkXO = re.compile(r"/Type(?= */XObject)")
        checkIM = re.compile(r"/Subtype(?= */Image)")
        imgcount = 0
        doc = fitz.open(path)
        try:
            # NOTE(review): _getXrefLength/_getXrefString/writePNG are kept
            # as in the original; check them against the installed PyMuPDF.
            lenXREF = doc._getXrefLength()
            print(lenXREF)
            # Print basic information about the PDF.
            print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
            # Walk every object in the cross-reference table.
            for i in range(1, lenXREF):
                text = doc._getXrefString(i)
                if not (checkXO.search(text) and checkIM.search(text)):
                    continue
                imgcount += 1
                pix = fitz.Pixmap(doc, i)
                target = os.path.join(pic_path, "img{}.png".format(imgcount))
                # Pixmaps with n < 5 are written as PNG directly; the others
                # are converted to RGB first.
                if pix.n >= 5:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(target)
                # Drop the pixmap before the next one is created.
                pix = None
        finally:
            # Close the document even when extraction fails part-way.
            doc.close()
        t1 = time.perf_counter()
        print("运行时间:{}s".format(t1 - t0))
        print("提取了{}张图片".format(imgcount))
        return str(imgcount)
    def call_pdf2pic(self, file):
        # pdf路径
        path = './upload/' + file
        pic_path = path[:-4]
        # 创建保存图片的文件夹
        if os.path.exists(pic_path):
            print("文件夹已存在，请重新创建新文件夹！")
            return "already exist"
            # raise SystemExit
        else:
            os.mkdir(pic_path)
            print(pic_path)
            m = to_pics().pdf2pic(path, pic_path)
            # return m
            return redirect(url_for('call_sim',path = file[:-4]))
            # return redirect(url_for('call_sim', m=m, pic_path=pic_path))
 # @bp_2pics.route('/tt/<m>/<pic_path>')
 # def call_sim(m, pic_path):
 #     message = app.similarity(m, pic_path)
 #     print("call_sim")
 #     return message