From 5f593316eb6914da1fda68dc1f1bcba6f3c0472b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B1=A4=E8=B6=8A?= <1196656373@qq.com>
Date: Fri, 2 Sep 2022 08:32:12 +0800
Subject: [PATCH] Add comments and pydoc

---
 Src/medi_fetch/main.py | 280 +++++++++++++++++++++++++------------------------
 1 file changed, 141 insertions(+), 139 deletions(-)

diff --git a/Src/medi_fetch/main.py b/Src/medi_fetch/main.py
index fc88780..ded9066 100644
--- a/Src/medi_fetch/main.py
+++ b/Src/medi_fetch/main.py
@@ -1,139 +1,141 @@
-# Medicine Infomation Spider
-# Spider: requests
-# parser: XPath
-# database: ?
-# Modules required: requests,lxml,(database connection)
-# -*- coding: utf-8 -*-
-# author:Shen tongle in DASE_ECNU 2021
-
-import requests
-from lxml import etree
-import time
-import sqlite3
-
-
-class Medicine:
-    def __init__(self):
-        self.head = "https://www.315jiage.cn/"
-        self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
-        self.headers = {
-            "User-agent": self.user
-        }
-        self.flag = 0
-        self.failed_num = 0
-        self.conn = sqlite3.connect('medicine.db')
-        self.cur = self.conn.cursor()
-
-
-    def get_data(self, type, page):
-        self.flag = 0
-        url = f"https://www.315jiage.cn/mc{type}p{page}.aspx"
-        response = requests.get(url, headers=self.headers, timeout=30)
-        self.content = response.content.decode("utf-8")
-        html = etree.HTML(self.content)
-        if html.xpath("//head/title/text()")[0][0] == "您":
-            self.flag += 1
-            return
-        results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""")
-        print(f"Browsing page {page}")
-        count = 0
-        for each_id in results:
-            try:
-                new_url = self.head + each_id
-                resp = requests.get(new_url, headers=self.headers)
-                content = resp.content.decode("utf-8")
-                ehtml = etree.HTML(content)
-
-                info = {}
-
-                name = ehtml.xpath("""//span[@itemprop="name"]/text()""")[0]
-                info["产品名称"] = name
-
-                base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""")
-                titles = ['规格:', '剂型:', '包装单位:', '批准文号:', '生产厂家:', '条形码:', '主治疾病:']
-                for each in base_info:
-                    if each in titles:
-                        if each == '规格:' or each == '剂型:':
-                            for every_info in base_info[base_info.index(each) + 1:]:
-                                if every_info in titles:
-                                    break
-                                else:
-                                    info[each[:-1]] = every_info[:-3]
-                                    break
-                        elif each == '主治疾病:':
-                            for every_info in base_info[base_info.index(each) + 1:]:
-                                if every_info in titles:
-                                    break
-                                else:
-                                    illness = ""
-                                    info[each[:-1]] = illness.join(every_info.replace("\xa0", "").split())
-                                    break
-                        else:
-                            for every_info in base_info[base_info.index(each) + 1:]:
-                                if every_info in titles:
-                                    break
-                                else:
-                                    info[each[:-1]] = every_info
-                                    break
-                info["主治疾病"]="测试"
-                info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0]
-
-                info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]"""))
-
-                temp = ehtml.xpath("""//ul[@class="property"]//text()""")
-                contents = []
-                for content in temp:
-                    if content not in contents:
-                        contents.append(content)
-
-                contents.pop(contents.index(" "))
-
-                # info["说明书"] = contents
-                SMS = ""
-
-                info["说明书"] = SMS.join(contents)
-
-                count += 1
-                print(f"Saving infomation {page}-{count}")
-                time.sleep(0.65)
-                # All infomation has been downloaded and preprocessed!
-                # -----------------------------------------
-                print(info)
-                # try:
-                #     self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)",
-                #                      (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"),
-                #                       info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"),
-                #                       info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"),
-                #                       info.get("是否处方", "NULL"), info.get("说明书", "NULL")))
-                #     self.conn.commit()
-                # except(sqlite3.Error):
-                #     self.conn.rollback()
-                #     print("saving error!")
-                # 'info' is going to be saved in one database
-                # Writing saving codes below......
-            except(IndexError,requests.HTTPError,TimeoutError,requests.exceptions.ConnectionError):
-                print("Failed to download data!")
-                self.failed_num += 1
-                continue
-
-
-
-
-def main():
-    medicine = Medicine()
-    types = [118, 119, 131]
-    a = [131]
-    for type in a:
-        for i in range(0,2500):
-            medicine.get_data(type, i + 1)
-            time.sleep(0.1)
-            if medicine.flag == 1:
-                break
-    print("下载失败的条目数量:", medicine.failed_num)
-    medicine.conn.close()
-
-
-if __name__ == "__main__":
-    main()
-
-# failed in 429, 430 is missing should be complemented
+# Medicine Information Spider
+# Spider: requests
+# parser: XPath
+# database: SQLite
+# Modules required: requests, lxml, sqlite3
+# -*- coding: utf-8 -*-
+# author:Shen tongle in DASE_ECNU 2021
+
+import requests
+from lxml import etree
+import time
+import sqlite3
+
+
+class Medicine:
+    def __init__(self):
+        self.head = "https://www.315jiage.cn/"
+        self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
+        self.headers = {
+            "User-agent": self.user
+        }
+        self.flag = 0
+        self.failed_num = 0
+        self.conn = sqlite3.connect('medicine.db')
+        self.cur = self.conn.cursor()
+
+
+    def get_data(self, type, page):
+        '''
+        Scrape one listing page of the given medicine type and print each medicine's info dict (the SQLite insert below is currently commented out).
+        '''
+        self.flag = 0
+        url = f"https://www.315jiage.cn/mc{type}p{page}.aspx"
+        response = requests.get(url, headers=self.headers, timeout=30)
+        self.content = response.content.decode("utf-8")
+        html = etree.HTML(self.content)
+        if html.xpath("//head/title/text()")[0][0] == "您":
+            self.flag += 1
+            return
+        results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""")
+        print(f"Browsing page {page}")
+        count = 0
+        for each_id in results:
+            try:
+                new_url = self.head + each_id
+                resp = requests.get(new_url, headers=self.headers)
+                content = resp.content.decode("utf-8")
+                ehtml = etree.HTML(content)
+
+                info = {}
+
+                name = ehtml.xpath("""//span[@itemprop="name"]/text()""")[0]
+                info["产品名称"] = name
+
+                base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""")
+                titles = ['规格:', '剂型:', '包装单位:', '批准文号:', '生产厂家:', '条形码:', '主治疾病:']
+                for each in base_info:
+                    if each in titles:
+                        if each == '规格:' or each == '剂型:':
+                            for every_info in base_info[base_info.index(each) + 1:]:
+                                if every_info in titles:
+                                    break
+                                else:
+                                    info[each[:-1]] = every_info[:-3]
+                                    break
+                        elif each == '主治疾病:':
+                            for every_info in base_info[base_info.index(each) + 1:]:
+                                if every_info in titles:
+                                    break
+                                else:
+                                    illness = ""
+                                    info[each[:-1]] = illness.join(every_info.replace("\xa0", "").split())
+                                    break
+                        else:
+                            for every_info in base_info[base_info.index(each) + 1:]:
+                                if every_info in titles:
+                                    break
+                                else:
+                                    info[each[:-1]] = every_info
+                                    break
+                info["主治疾病"]="测试"
+                info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0]
+
+                info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]"""))
+
+                temp = ehtml.xpath("""//ul[@class="property"]//text()""")
+                contents = []
+                for content in temp:
+                    if content not in contents:
+                        contents.append(content)
+
+                contents.pop(contents.index(" "))
+
+                # info["说明书"] = contents
+                SMS = ""
+
+                info["说明书"] = SMS.join(contents)
+
+                count += 1
+                print(f"Saving infomation {page}-{count}")
+                time.sleep(0.65)
+                # All information has been downloaded and preprocessed!
+                # -----------------------------------------
+                print(info)
+                # The following code is used only once, when saving the raw medicine info to the database is required. Uncomment it when needed.
+                # try:
+                #     self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)",
+                #                      (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"),
+                #                       info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"),
+                #                       info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"),
+                #                       info.get("是否处方", "NULL"), info.get("说明书", "NULL")))
+                #     self.conn.commit()
+                # except(sqlite3.Error):
+                #     self.conn.rollback()
+                #     print("saving error!")
+                # 'info' is intended to be saved in a database.
+                # Write the database-saving code below.
+            except(IndexError,requests.HTTPError,TimeoutError,requests.exceptions.ConnectionError):
+                print("Failed to download data!")
+                self.failed_num += 1
+                continue
+
+
+
+
+def main():
+    medicine = Medicine()
+    types = [118, 119, 131]
+    a = [131]
+    for type in a:
+        for i in range(0,2500):
+            medicine.get_data(type, i + 1)
+            time.sleep(0.1)
+            if medicine.flag == 1:
+                break
+    print("下载失败的条目数量:", medicine.failed_num)
+    medicine.conn.close()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file