Pirate bay search engine update

2025-03-13 05:41:17 +00:00 · 2014-10-12 20:15:18 +04:00 · 2014-10-12 20:15:18 +04:00 · a62e30ea88
commit a62e30ea88
parent 0e0e8f7c27
4 changed files with 204 additions and 166 deletions
--- a/src/searchengine/nova/engines/piratebay.py
+++ b/src/searchengine/nova/engines/piratebay.py
@@ -1,6 +1,7 @@
-#VERSION: 1.53
+#VERSION: 2.00
 #AUTHORS: Fabien Devaux (fab@gnux.info)
 #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
+#              Arthur (custparasite@gmx.se)

 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -27,94 +28,112 @@
 # POSSIBILITY OF SUCH DAMAGE.

 from novaprinter import prettyPrinter
-import sgmllib
-from helpers import retrieve_url, download_file
+from HTMLParser import HTMLParser
+from helpers import download_file
+import urllib2

 PREVIOUS_IDS = set()

 class piratebay(object):
-	url = 'https://thepiratebay.se'
-	name = 'The Pirate Bay'
-	supported_categories = {'all': '0', 'movies': '200', 'music': '100', 'games': '400', 'software': '300'}
+    url = 'http://thepiratebay.se'
+    name = 'The Pirate Bay'
+    supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}

-	def __init__(self):
-		self.results = []
-		self.parser = self.SimpleSGMLParser(self.results, self.url)
+    def download_torrent(self, info):
+        print(download_file(info))

-	def download_torrent(self, info):
-		print download_file(info)
+    class MyHtmlParseWithBlackJack(HTMLParser):
+        def __init__(self, results, url):
+            HTMLParser.__init__(self)
+            self.url = url
+            self.results = results
+            self.current_item = None
+            self.size_found = False
+            self.unit_found = False
+            self.seed_found = False
+            self.skip_td = False
+            self.leech_found = False
+            self.dispatcher = {'a'      : self.handle_tag_a_ref,
+                               'font'   : self.handle_tag_font_size,
+                               'td'     : self.handle_tag_td_sl      }

-	class SimpleSGMLParser(sgmllib.SGMLParser):
-		def __init__(self, results, url, *args):
-			sgmllib.SGMLParser.__init__(self)
-			self.td_counter = None
-			self.current_item = None
-			self.results = results
-			self.url = url
-			self.code = 0
-			self.in_name = None
+        def handle_tag_a_ref(self, attrs):
+            params = dict(attrs)
+            #1
+            if params['href'].startswith('/torrent/'):
+                get_id = params['href'].split('/')[2]
+                if not get_id in PREVIOUS_IDS:
+                    self.current_item = {}
+                    self.current_item['desc_link'] = self.url + params['href'].strip()
+                    self.current_item['name'] = params['title'][12:].strip()
+                    self.current_item['id'] = get_id
+            #2
+            elif (not self.current_item is None) and (params['href'].startswith('magnet:')):
+                self.current_item['link'] = params['href'].strip()

-		def start_a(self, attr):
-			params = dict(attr)
-			if params['href'].startswith('/torrent/'):
-				self.current_item = {}
-				self.td_counter = 0
-				self.current_item['desc_link'] = self.url + params['href'].strip()
-				self.in_name = True
-				self.current_item['id'] = params['href'].split('/')[2]
-			elif params['href'].startswith('magnet:'):
-				self.current_item['link']=params['href'].strip()
-				self.in_name = False
+        def handle_tag_font_size(self, attrs):
+            if not self.current_item is None:
+                params = dict(attrs)
+                #3
+                if params['class'] == "detDesc":
+                    self.size_found = True

-		def handle_data(self, data):
-			if self.td_counter == 0:
-				if self.in_name:
-					if not self.current_item.has_key('name'):
-						self.current_item['name'] = ''
-					self.current_item['name']+= data.strip()
-				else:
-					#Parse size
-					if 'Size' in data:
-						self.current_item['size'] = data[data.index("Size")+5:]
-						self.current_item['size'] = self.current_item['size'][:self.current_item['size'].index(',')]
-			elif self.td_counter == 1:
-				if not self.current_item.has_key('seeds'):
-					self.current_item['seeds'] = ''
-				self.current_item['seeds']+= data.strip()
-			elif self.td_counter == 2:
-				if not self.current_item.has_key('leech'):
-					self.current_item['leech'] = ''
-				self.current_item['leech']+= data.strip()
+        def handle_tag_td_sl(self, attrs):
+            if not self.current_item is None:
+                params = dict(attrs)
+                if not self.current_item is None:
+                    if self.seed_found:
+                        #5
+                        self.current_item['leech'] = ''
+                        self.leech_found = True
+                        self.seed_found = False
+                    else:
+                        #4
+                        self.current_item['seeds'] = ''
+                        self.seed_found = True

-		def start_td(self,attr):
-			if isinstance(self.td_counter,int):
-				self.td_counter += 1
-				if self.td_counter > 3:
-					self.td_counter = None
-					# Display item
-					if self.current_item:
-						if self.current_item['id'] in PREVIOUS_IDS:
-							self.results = []
-							self.reset()
-							return
-						self.current_item['engine_url'] = self.url
-						if not self.current_item['seeds'].isdigit():
-							self.current_item['seeds'] = 0
-						if not self.current_item['leech'].isdigit():
-							self.current_item['leech'] = 0
-						prettyPrinter(self.current_item)
-						PREVIOUS_IDS.add(self.current_item['id'])
-						self.results.append('a')
-	def search(self, what, cat='all'):
-		ret = []
-		i = 0
-		order = 'se'
-		while True and i<11:
-			results = []
-			parser = self.SimpleSGMLParser(results, self.url)
-			dat = retrieve_url(self.url+'/search/%s/%d/7/%s' % (what, i, self.supported_categories[cat]))
-			parser.feed(dat)
-			parser.close()
-			if len(results) <= 0:
-				break
-			i += 1
+        def handle_starttag(self, tag, attrs):
+            if tag in self.dispatcher:
+                self.dispatcher[tag](attrs)
+
+        def handle_data(self, data):
+            if not self.current_item is None:
+                if self.size_found:
+                    #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by']
+                    temp = data.split()
+                    if 'Size' in temp:
+                        sizeIn = temp.index('Size')
+                        self.current_item['size'] = temp[sizeIn + 1]
+                        self.size_found = False
+                        self.unit_found = True
+                elif self.unit_found:
+                    temp = data.split()
+                    self.current_item['size'] = ' '.join((self.current_item['size'], temp[0]))
+                    self.unit_found = False
+                elif self.seed_found:
+                    self.current_item['seeds'] += data.rstrip()
+                elif self.leech_found:
+                    self.current_item['leech'] += data.rstrip()
+                    self.current_item['engine_url'] = self.url
+                    prettyPrinter(self.current_item)
+                    PREVIOUS_IDS.add(self.current_item['id'])
+                    self.results.append('a')
+                    self.current_item = None
+                    self.size_found = False
+                    self.unit_found = False
+                    self.seed_found = False
+                    self.leech_found = False
+
+    def search(self, what, cat='all'):
+        ret = []
+        i = 0
+        while i < 11:
+            results = []
+            parser = self.MyHtmlParseWithBlackJack(results, self.url)
+            query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat])
+            dat = urllib2.urlopen(query)
+            parser.feed(dat.read().decode('utf-8'))
+            parser.close()
+            if len(results) <= 0:
+                break
+            i += 1
--- a/src/searchengine/nova/engines/versions.txt
+++ b/src/searchengine/nova/engines/versions.txt
@@ -1,6 +1,6 @@
 torrentreactor: 1.33
 mininova: 1.50
-piratebay: 1.53
+piratebay: 2.00
 vertor: 1.3
 extratorrent: 1.2
 kickasstorrents: 1.24
--- a/src/searchengine/nova3/engines/piratebay.py
+++ b/src/searchengine/nova3/engines/piratebay.py
@@ -1,6 +1,7 @@
-#VERSION: 1.53
+#VERSION: 2.00
 #AUTHORS: Fabien Devaux (fab@gnux.info)
 #CONTRIBUTORS: Christophe Dumez (chris@qbittorrent.org)
+#              Arthur (custparasite@gmx.se)

 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -27,94 +28,112 @@
 # POSSIBILITY OF SUCH DAMAGE.

 from novaprinter import prettyPrinter
-import sgmllib3
-from helpers import retrieve_url, download_file
+from html.parser import HTMLParser
+from helpers import download_file
+import urllib.request

 PREVIOUS_IDS = set()

 class piratebay(object):
-	url = 'https://thepiratebay.se'
-	name = 'The Pirate Bay'
-	supported_categories = {'all': '0', 'movies': '200', 'music': '100', 'games': '400', 'software': '300'}
+    url = 'http://thepiratebay.se'
+    name = 'The Pirate Bay'
+    supported_categories = {'all': '0', 'music': '100', 'movies': '200', 'games': '400', 'software': '300'}

-	def __init__(self):
-		self.results = []
-		self.parser = self.SimpleSGMLParser(self.results, self.url)
+    def download_torrent(self, info):
+        print(download_file(info))

-	def download_torrent(self, info):
-		print(download_file(info))
+    class MyHtmlParseWithBlackJack(HTMLParser):
+        def __init__(self, results, url):
+            super().__init__()
+            self.url = url
+            self.results = results
+            self.current_item = None
+            self.size_found = False
+            self.unit_found = False
+            self.seed_found = False
+            self.skip_td = False
+            self.leech_found = False
+            self.dispatcher = {'a'      : self.handle_tag_a_ref,
+                               'font'   : self.handle_tag_font_size,
+                               'td'     : self.handle_tag_td_sl      }

-	class SimpleSGMLParser(sgmllib3.SGMLParser):
-		def __init__(self, results, url, *args):
-			sgmllib3.SGMLParser.__init__(self)
-			self.td_counter = None
-			self.current_item = None
-			self.results = results
-			self.url = url
-			self.code = 0
-			self.in_name = None
+        def handle_tag_a_ref(self, attrs):
+            params = dict(attrs)
+            #1
+            if params['href'].startswith('/torrent/'):
+                get_id = params['href'].split('/')[2]
+                if not get_id in PREVIOUS_IDS:
+                    self.current_item = {}
+                    self.current_item['desc_link'] = self.url + params['href'].strip()
+                    self.current_item['name'] = params['title'][12:].strip()
+                    self.current_item['id'] = get_id
+            #2
+            elif (not self.current_item is None) and (params['href'].startswith('magnet:')):
+                self.current_item['link'] = params['href'].strip()

-		def start_a(self, attr):
-			params = dict(attr)
-			if params['href'].startswith('/torrent/'):
-				self.current_item = {}
-				self.td_counter = 0
-				self.current_item['desc_link'] = self.url + params['href'].strip()
-				self.in_name = True
-				self.current_item['id'] = params['href'].split('/')[2]
-			elif params['href'].startswith('magnet:'):
-				self.current_item['link']=params['href'].strip()
-				self.in_name = False
+        def handle_tag_font_size(self, attrs):
+            if not self.current_item is None:
+                params = dict(attrs)
+                #3
+                if params['class'] == "detDesc":
+                    self.size_found = True

-		def handle_data(self, data):
-			if self.td_counter == 0:
-				if self.in_name:
-					if 'name' not in self.current_item:
-						self.current_item['name'] = ''
-					self.current_item['name']+= data.strip()
-				else:
-					#Parse size
-					if 'Size' in data:
-						self.current_item['size'] = data[data.index("Size")+5:]
-						self.current_item['size'] = self.current_item['size'][:self.current_item['size'].index(',')]
-			elif self.td_counter == 1:
-				if 'seeds' not in self.current_item:
-					self.current_item['seeds'] = ''
-				self.current_item['seeds']+= data.strip()
-			elif self.td_counter == 2:
-				if 'leech' not in self.current_item:
-					self.current_item['leech'] = ''
-				self.current_item['leech']+= data.strip()
+        def handle_tag_td_sl(self, attrs):
+            if not self.current_item is None:
+                params = dict(attrs)
+                if not self.current_item is None:
+                    if self.seed_found:
+                        #5
+                        self.current_item['leech'] = ''
+                        self.leech_found = True
+                        self.seed_found = False
+                    else:
+                        #4
+                        self.current_item['seeds'] = ''
+                        self.seed_found = True

-		def start_td(self,attr):
-			if isinstance(self.td_counter,int):
-				self.td_counter += 1
-				if self.td_counter > 3:
-					self.td_counter = None
-					# Display item
-					if self.current_item:
-						if self.current_item['id'] in PREVIOUS_IDS:
-							self.results = []
-							self.reset()
-							return
-						self.current_item['engine_url'] = self.url
-						if not self.current_item['seeds'].isdigit():
-							self.current_item['seeds'] = 0
-						if not self.current_item['leech'].isdigit():
-							self.current_item['leech'] = 0
-						prettyPrinter(self.current_item)
-						PREVIOUS_IDS.add(self.current_item['id'])
-						self.results.append('a')
-	def search(self, what, cat='all'):
-		ret = []
-		i = 0
-		order = 'se'
-		while True and i<11:
-			results = []
-			parser = self.SimpleSGMLParser(results, self.url)
-			dat = retrieve_url(self.url+'/search/%s/%d/7/%s' % (what, i, self.supported_categories[cat]))
-			parser.feed(dat)
-			parser.close()
-			if len(results) <= 0:
-				break
-			i += 1
+        def handle_starttag(self, tag, attrs):
+            if tag in self.dispatcher:
+                self.dispatcher[tag](attrs)
+
+        def handle_data(self, data):
+            if not self.current_item is None:
+                if self.size_found:
+                    #with utf-8 you're going to have something like that: ['Uploaded', '10-02'], ['15:31,', 'Size', '240.34'], ['MiB,', 'ULed', 'by']
+                    temp = data.split()
+                    if 'Size' in temp:
+                        sizeIn = temp.index('Size')
+                        self.current_item['size'] = temp[sizeIn + 1]
+                        self.size_found = False
+                        self.unit_found = True
+                elif self.unit_found:
+                    temp = data.split()
+                    self.current_item['size'] = ' '.join((self.current_item['size'], temp[0]))
+                    self.unit_found = False
+                elif self.seed_found:
+                    self.current_item['seeds'] += data.rstrip()
+                elif self.leech_found:
+                    self.current_item['leech'] += data.rstrip()
+                    self.current_item['engine_url'] = self.url
+                    prettyPrinter(self.current_item)
+                    PREVIOUS_IDS.add(self.current_item['id'])
+                    self.results.append('a')
+                    self.current_item = None
+                    self.size_found = False
+                    self.unit_found = False
+                    self.seed_found = False
+                    self.leech_found = False
+
+    def search(self, what, cat='all'):
+        ret = []
+        i = 0
+        while i < 11:
+            results = []
+            parser = self.MyHtmlParseWithBlackJack(results, self.url)
+            query = '%s/search/%s/%d/99/%s' % (self.url, what, i, self.supported_categories[cat])
+            dat = urllib.request.urlopen(query)
+            parser.feed(dat.read().decode('utf-8'))
+            parser.close()
+            if len(results) <= 0:
+                break
+            i += 1
--- a/src/searchengine/nova3/engines/versions.txt
+++ b/src/searchengine/nova3/engines/versions.txt
@@ -1,6 +1,6 @@
 torrentreactor: 1.33
 mininova: 1.50
-piratebay: 1.53
+piratebay: 2.00
 vertor: 1.3
 extratorrent: 1.2
 kickasstorrents: 1.24