# Author: Not specified
# Language: python
# Description: Not specified
# Timestamp: 2017-09-27 08:50:45 +0000
# View raw paste Reply
import os
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException

  5. fileWrite = open("proxyListSelenium.txt", "w")
  6.  
  7. browser = webdriver.Chrome()
  8.  
  9. print "Scraping idcloak.com"
  10. browser.get('http://www.idcloak.com/proxylist/proxy-list.html')
  11. i = 2
  12. page = 1
  13. while page < 10: #number of pages to scrap
  14.         browser.find_element_by_xpath('//*[@id="proxy-search"]/div[2]/div/input['+str(page)+']').click()
  15.         print "Page: "+str(page)
  16.         while i <= 101:
  17.                 try:
  18.                         ip = browser.find_element_by_xpath('//*[@id="sort"]/tbody/tr['+str(i)+']/td[8]')
  19.                         port = browser.find_element_by_xpath('//*[@id="sort"]/tbody/tr['+str(i)+']/td[7]')
  20.                         fileWrite.write(ip.text+":"+port.text+"\n")
  21.                 except:
  22.                         print "Exception caught - Moving on"
  23.                 i += 1
  24.         i = 2
  25.         page += 1
  26.        
  27. #-------------------------------------------------------------------------------------------
  28. print "Scraping cool-proxy.net"
  29. i = 2
  30. page = 1
  31. while page < 13:
  32.         browser.get('https://www.cool-proxy.net/proxies/http_proxy_list/page:'+str(page)+'/sort:score/direction:desc')
  33.         print "Page: "+str(page)
  34.         while i <= 22:
  35.                 if i != 7:     
  36.                         ip = browser.find_element_by_xpath('//*[@id="main"]/table/tbody/tr['+str(i)+']/td[1]')
  37.                         port = browser.find_element_by_xpath('//*[@id="main"]/table/tbody/tr['+str(i)+']/td[2]')
  38.                         fileWrite.write(ip.text+":"+port.text+"\n")
  39.                 i += 1
  40.         i = 2
  41.         page += 1
  42.  
  43. #-------------------------------------------------------------------------------------------
  44. print "Scraping premproxy.com"
  45. i = 0
  46. page = 1
  47. while page <= 15:
  48.         browser.get('https://premproxy.com/list/ip-port/'+str(page)+'.htm')
  49.         print "Page: "+str(page)
  50.         ip = browser.find_element_by_xpath('//*[@id="pricing"]/div/div/div/pre')
  51.         fileWrite.write(ip.text)
  52.         page += 1
  53.  
  54. #-------------------------------------------------------------------------------------------
  55. print "Scraping hidemy.name"
  56. i = 1
  57. page = 1
  58. browser.get('https://hidemy.name/en/proxy-list/?start=1#list')
  59. time.sleep(5)
  60. while page <= 768:
  61.         browser.get('https://hidemy.name/en/proxy-list/?start='+str(page)+'#list')
  62.         print "Page: "+str(page)
  63.         while i <= 64:
  64.                 ip = browser.find_element_by_xpath('//*[@id="content-section"]/section[1]/div/table/tbody/tr['+str(i)+']/td[1]')
  65.                 port = browser.find_element_by_xpath('//*[@id="content-section"]/section[1]/div/table/tbody/tr['+str(i)+']/td[2]')
  66.                 fileWrite.write(ip.text+":"+port.text+"\n")
  67.                 i += 1
  68.         i = 1
  69.         page += 64
  70.  
  71. #-------------------------------------------------------------------------------------------
  72. print "Scraping proxydb.net"
  73. i = 1
  74. page = 0
  75. while page <= 200:
  76.         browser.get('http://proxydb.net/?offset='+str(page))
  77.         print "Page: "+str(page)
  78.         while i <= 20:
  79.                 try:   
  80.                         ip = browser.find_element_by_xpath('/html/body/div[2]/table/tbody/tr['+str(i)+']/td[1]/a')
  81.                         fileWrite.write(ip.text+"\n")
  82.                 except:
  83.                         pass
  84.                 i += 1
  85.         page += 20
  86.         i = 1
  87.  
  88.        
  89.  
  90.  
  91. browser.quit()
  92.  
  93. fileWrite.close()
  94.  
  95. #remove duplicates
  96. lines_seen = set() # holds lines already seen
  97.  
  98. readFile = open("proxyListSelenium.txt", "r")
  99. fileWrite = open("unique.txt", "w")
  100. for line in readFile:
  101.     if line not in lines_seen: # not a duplicate
  102.         fileWrite.write(line)
  103.         lines_seen.add(line)
  104.  
  105. fileWrite.close()
  106. readFile.close()
  107.  
  108. os.remove("proxyListSelenium.txt")
  109. os.rename("unique.txt", "proxyListSelenium.txt")
# View raw paste Reply