You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

425 lines
16 KiB

2 years ago
  1. # coding=utf-8
  2. from lxml import etree
  3. import sqlite3
  4. import re
  5. import requests
  6. import random
  7. import time
  8. import logging
# Pool of desktop and mobile User-Agent strings. One is picked at random for
# every outgoing HTTP request (see get_user_agent) so the traffic is less
# uniform; note several entries are split across adjacent string literals,
# which Python concatenates at compile time.
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 "
    "Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 "
    "Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR "
    "3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 "
    "Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET "
    "CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) "
    "Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) "
    "Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) "
    "Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) "
    "Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) "
    "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) "
    "Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 "
    "Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) "
    "wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) "
    "AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 "
    "Mobile/10A5376e Safari/8536.25",
]
  64. def get_user_agent():
  65. headers = {"User-Agent": random.choice(user_agent)}
  66. return headers
  67. class Scraper:
  68. database: str
  69. tag: str
  70. page: int
  71. def __init__(self):
  72. self.database = "book.db"
  73. self.tag = ""
  74. self.page = 0
  75. self.pattern_number = re.compile(r"\d+\.?\d*")
  76. logging.basicConfig(filename="scraper.log", level=logging.ERROR)
  77. def get_current_progress(self) -> ():
  78. conn = sqlite3.connect(self.database)
  79. results = conn.execute("SELECT tag, page from progress where id = '0'")
  80. for row in results:
  81. return row[0], row[1]
  82. return "", 0
  83. def save_current_progress(self, current_tag, current_page):
  84. conn = sqlite3.connect(self.database)
  85. conn.execute(
  86. "UPDATE progress set tag = '{}', page = {} where id = '0'".format(
  87. current_tag, current_page
  88. )
  89. )
  90. conn.commit()
  91. conn.close()
  92. def start_grab(self) -> bool:
  93. self.create_tables()
  94. scraper.grab_tag()
  95. current_tag, current_page = self.get_current_progress()
  96. tags = self.get_tag_list()
  97. for i in range(0, len(tags)):
  98. no = 0
  99. if i == 0 and current_tag == tags[i]:
  100. no = current_page
  101. while self.grab_book_list(tags[i], no):
  102. no = no + 20
  103. return True
  104. def create_tables(self):
  105. conn = sqlite3.connect(self.database)
  106. try:
  107. conn.execute("CREATE TABLE tags (tag TEXT PRIMARY KEY)")
  108. conn.commit()
  109. except sqlite3.Error as e:
  110. logging.error(str(e))
  111. conn.rollback()
  112. try:
  113. conn.execute(
  114. "CREATE TABLE book ("
  115. "id TEXT PRIMARY KEY, title TEXT, author TEXT, "
  116. "publisher TEXT, original_title TEXT, "
  117. "translator TEXT, pub_year TEXT, pages INTEGER, "
  118. "price INTEGER, currency_unit TEXT, binding TEXT, "
  119. "isbn TEXT, author_intro TEXT, book_intro text, "
  120. "content TEXT, tags TEXT, picture BLOB)"
  121. )
  122. conn.commit()
  123. except sqlite3.Error as e:
  124. logging.error(str(e))
  125. conn.rollback()
  126. try:
  127. conn.execute(
  128. "CREATE TABLE progress (id TEXT PRIMARY KEY, tag TEXT, page integer )"
  129. )
  130. conn.execute("INSERT INTO progress values('0', '', 0)")
  131. conn.commit()
  132. except sqlite3.Error as e:
  133. logging.error(str(e))
  134. conn.rollback()
  135. def grab_tag(self):
  136. url = "https://book.douban.com/tag/?view=cloud"
  137. r = requests.get(url, headers=get_user_agent())
  138. r.encoding = "utf-8"
  139. h: etree.ElementBase = etree.HTML(r.text)
  140. tags: [] = h.xpath(
  141. '/html/body/div[@id="wrapper"]/div[@id="content"]'
  142. '/div[@class="grid-16-8 clearfix"]/div[@class="article"]'
  143. '/div[@class=""]/div[@class="indent tag_cloud"]'
  144. "/table/tbody/tr/td/a/@href"
  145. )
  146. conn = sqlite3.connect(self.database)
  147. c = conn.cursor()
  148. try:
  149. for tag in tags:
  150. t: str = tag.strip("/tag")
  151. c.execute("INSERT INTO tags VALUES ('{}')".format(t))
  152. c.close()
  153. conn.commit()
  154. conn.close()
  155. except sqlite3.Error as e:
  156. logging.error(str(e))
  157. conn.rollback()
  158. return False
  159. return True
  160. def grab_book_list(self, tag="小说", pageno=1) -> bool:
  161. logging.info("start to grab tag {} page {}...".format(tag, pageno))
  162. self.save_current_progress(tag, pageno)
  163. url = "https://book.douban.com/tag/{}?start={}&type=T".format(tag, pageno)
  164. r = requests.get(url, headers=get_user_agent())
  165. r.encoding = "utf-8"
  166. h: etree.Element = etree.HTML(r.text)
  167. li_list: [] = h.xpath(
  168. '/html/body/div[@id="wrapper"]/div[@id="content"]'
  169. '/div[@class="grid-16-8 clearfix"]'
  170. '/div[@class="article"]/div[@id="subject_list"]'
  171. '/ul/li/div[@class="info"]/h2/a/@href'
  172. )
  173. next_page = h.xpath(
  174. '/html/body/div[@id="wrapper"]/div[@id="content"]'
  175. '/div[@class="grid-16-8 clearfix"]'
  176. '/div[@class="article"]/div[@id="subject_list"]'
  177. '/div[@class="paginator"]/span[@class="next"]/a[@href]'
  178. )
  179. has_next = True
  180. if len(next_page) == 0:
  181. has_next = False
  182. if len(li_list) == 0:
  183. return False
  184. for li in li_list:
  185. li.strip("")
  186. book_id = li.strip("/").split("/")[-1]
  187. try:
  188. delay = float(random.randint(0, 200)) / 100.0
  189. time.sleep(delay)
  190. self.crow_book_info(book_id)
  191. except BaseException as e:
  192. logging.error(
  193. logging.error("error when scrape {}, {}".format(book_id, str(e)))
  194. )
  195. return has_next
  196. def get_tag_list(self) -> [str]:
  197. ret = []
  198. conn = sqlite3.connect(self.database)
  199. results = conn.execute(
  200. "SELECT tags.tag from tags join progress where tags.tag >= progress.tag"
  201. )
  202. for row in results:
  203. ret.append(row[0])
  204. return ret
  205. def crow_book_info(self, book_id) -> bool:
  206. conn = sqlite3.connect(self.database)
  207. for _ in conn.execute("SELECT id from book where id = ('{}')".format(book_id)):
  208. return
  209. url = "https://book.douban.com/subject/{}/".format(book_id)
  210. r = requests.get(url, headers=get_user_agent())
  211. r.encoding = "utf-8"
  212. h: etree.Element = etree.HTML(r.text)
  213. e_text = h.xpath('/html/body/div[@id="wrapper"]/h1/span/text()')
  214. if len(e_text) == 0:
  215. return False
  216. title = e_text[0]
  217. elements = h.xpath(
  218. '/html/body/div[@id="wrapper"]'
  219. '/div[@id="content"]/div[@class="grid-16-8 clearfix"]'
  220. '/div[@class="article"]'
  221. )
  222. if len(elements) == 0:
  223. return False
  224. e_article = elements[0]
  225. book_intro = ""
  226. author_intro = ""
  227. content = ""
  228. tags = ""
  229. e_book_intro = e_article.xpath(
  230. 'div[@class="related_info"]'
  231. '/div[@class="indent"][@id="link-report"]/*'
  232. '/div[@class="intro"]/*/text()'
  233. )
  234. for line in e_book_intro:
  235. line = line.strip()
  236. if line != "":
  237. book_intro = book_intro + line + "\n"
  238. e_author_intro = e_article.xpath(
  239. 'div[@class="related_info"]'
  240. '/div[@class="indent "]/*'
  241. '/div[@class="intro"]/*/text()'
  242. )
  243. for line in e_author_intro:
  244. line = line.strip()
  245. if line != "":
  246. author_intro = author_intro + line + "\n"
  247. e_content = e_article.xpath(
  248. 'div[@class="related_info"]'
  249. '/div[@class="indent"][@id="dir_' + book_id + '_full"]/text()'
  250. )
  251. for line in e_content:
  252. line = line.strip()
  253. if line != "":
  254. content = content + line + "\n"
  255. e_tags = e_article.xpath(
  256. 'div[@class="related_info"]/'
  257. 'div[@id="db-tags-section"]/'
  258. 'div[@class="indent"]/span/a/text()'
  259. )
  260. for line in e_tags:
  261. line = line.strip()
  262. if line != "":
  263. tags = tags + line + "\n"
  264. e_subject = e_article.xpath(
  265. 'div[@class="indent"]'
  266. '/div[@class="subjectwrap clearfix"]'
  267. '/div[@class="subject clearfix"]'
  268. )
  269. pic_href = e_subject[0].xpath('div[@id="mainpic"]/a/@href')
  270. picture = None
  271. if len(pic_href) > 0:
  272. res = requests.get(pic_href[0], headers=get_user_agent())
  273. picture = res.content
  274. info_children = e_subject[0].xpath('div[@id="info"]/child::node()')
  275. e_array = []
  276. e_dict = dict()
  277. for e in info_children:
  278. if isinstance(e, etree._ElementUnicodeResult):
  279. e_dict["text"] = e
  280. elif isinstance(e, etree._Element):
  281. if e.tag == "br":
  282. e_array.append(e_dict)
  283. e_dict = dict()
  284. else:
  285. e_dict[e.tag] = e
  286. book_info = dict()
  287. for d in e_array:
  288. label = ""
  289. span = d.get("span")
  290. a_label = span.xpath("span/text()")
  291. if len(a_label) > 0 and label == "":
  292. label = a_label[0].strip()
  293. a_label = span.xpath("text()")
  294. if len(a_label) > 0 and label == "":
  295. label = a_label[0].strip()
  296. label = label.strip(":")
  297. text = d.get("text").strip()
  298. e_a = d.get("a")
  299. text.strip()
  300. text.strip(":")
  301. if label == "作者" or label == "译者":
  302. a = span.xpath("a/text()")
  303. if text == "" and len(a) == 1:
  304. text = a[0].strip()
  305. if text == "" and e_a is not None:
  306. text_a = e_a.xpath("text()")
  307. if len(text_a) > 0:
  308. text = text_a[0].strip()
  309. text = re.sub(r"\s+", " ", text)
  310. if text != "":
  311. book_info[label] = text
  312. sql = (
  313. "INSERT INTO book("
  314. "id, title, author, "
  315. "publisher, original_title, translator, "
  316. "pub_year, pages, price, "
  317. "currency_unit, binding, isbn, "
  318. "author_intro, book_intro, content, "
  319. "tags, picture)"
  320. "VALUES("
  321. "?, ?, ?, "
  322. "?, ?, ?, "
  323. "?, ?, ?, "
  324. "?, ?, ?, "
  325. "?, ?, ?, "
  326. "?, ?)"
  327. )
  328. unit = None
  329. price = None
  330. pages = None
  331. conn = sqlite3.connect(self.database)
  332. try:
  333. s_price = book_info.get("定价")
  334. if s_price is None:
  335. # price cannot be NULL
  336. logging.error(
  337. "error when scrape book_id {}, cannot retrieve price...", book_id
  338. )
  339. return None
  340. else:
  341. e = re.findall(self.pattern_number, s_price)
  342. if len(e) != 0:
  343. number = e[0]
  344. unit = s_price.replace(number, "").strip()
  345. price = int(float(number) * 100)
  346. s_pages = book_info.get("页数")
  347. if s_pages is not None:
  348. # pages can be NULL
  349. e = re.findall(self.pattern_number, s_pages)
  350. if len(e) != 0:
  351. pages = int(e[0])
  352. conn.execute(
  353. sql,
  354. (
  355. book_id,
  356. title,
  357. book_info.get("作者"),
  358. book_info.get("出版社"),
  359. book_info.get("原作名"),
  360. book_info.get("译者"),
  361. book_info.get("出版年"),
  362. pages,
  363. price,
  364. unit,
  365. book_info.get("装帧"),
  366. book_info.get("ISBN"),
  367. author_intro,
  368. book_intro,
  369. content,
  370. tags,
  371. picture,
  372. ),
  373. )
  374. conn.commit()
  375. except sqlite3.Error as e:
  376. logging(str(e))
  377. conn.rollback()
  378. except TypeError as e:
  379. logging.error("error when scrape {}, {}".format(book_id, str(e)))
  380. conn.rollback()
  381. return False
  382. conn.close()
  383. return True
if __name__ == "__main__":
    # Script entry point: build the scraper and run the full crawl pipeline.
    # NOTE(review): ``scraper`` is bound at module level and start_grab
    # references that global name internally — keep the binding.
    scraper = Scraper()
    scraper.start_grab()