首页 > > 网络编程 > 其它 >

Python 爬取电影天堂恐怖片+游戏

2018-06-18 00:15:33来源：未知阅读 ()

1、爬取方式：Python + Selenium

2、工作流程

Selenium 自动输入搜索关键词并提交，自动爬取搜索结果，为每个条目建立文件夹，并把下载链接（磁力链接）存入该文件夹下的记事本文件 bt.txt

3、贴上代码

#!/usr/bin/Python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import os
import urllib2
import time
import random
import re
# Start a Chrome session. fun() further down uses this module-level
# `browser` directly; the window helpers receive it as an argument.
browser = webdriver.Chrome()
#browser.set_window_position(20, 40)
#browser.set_window_size(1100, 700)
browser.maximize_window() # maximize the browser window
# Implicit wait: passed straight to Selenium's implicitly_wait() with 10.
browser.implicitly_wait(10)
# Open the site's home page.
browser.get('http://www.dytt8.net/')
# First pass over the search box: clear it and type the keyword
# (u'恐怖' means "horror").
# NOTE(review): the same two steps are repeated after close(browser) below;
# presumably this first interaction makes the site open extra windows that
# have to be closed before the real search -- confirm.
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')

def close(browser):
    """Close every window except the current one and keep focus on it.

    Args:
        browser: Selenium WebDriver instance controlling the session.

    The window that has focus when the function is called stays open and
    has focus again when the function returns, regardless of its position
    in ``browser.window_handles``.
    """
    # Handle of the window that currently has focus (window A).
    handle = browser.current_window_handle
    # Snapshot of all open window handles (A, B, ...).
    handles = browser.window_handles
    for newhandle in handles:
        # Only the other windows (B, ...) are closed.
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            browser.close()
    # Return to the window we started from.  Using the saved handle (rather
    # than handles[0]) is what keeps this correct when the starting window
    # is not the first entry in the handle list.
    browser.switch_to_window(handle)

def change(browser):
    """Move focus from the current window to another open window.

    Args:
        browser: Selenium WebDriver instance controlling the session.

    Every handle that differs from the current one is switched to in the
    order ``browser.window_handles`` reports them, so focus ends on the
    last such handle.  With a single open window nothing happens.
    """
    origin = browser.current_window_handle
    # All windows except the one we are leaving.
    targets = [h for h in browser.window_handles if h != origin]
    for target in targets:
        browser.switch_to_window(target)
            
def back(browser):
    """Close the current window and move focus to a window that is still open.

    Args:
        browser: Selenium WebDriver instance controlling the session.

    After closing the focused window, focus moves to the first handle in
    ``browser.window_handles`` (as captured before the close) that is not
    the closed window.  If no other window exists, no switch is attempted.
    """
    # Handle of the window that currently has focus; this one gets closed.
    handle = browser.current_window_handle
    # Snapshot taken before closing so the surviving handles are known.
    handles = browser.window_handles
    browser.close()
    # Never target the handle that was just closed: picking handles[0]
    # unconditionally would do exactly that whenever the closed window
    # happened to be first in the list.
    remaining = [h for h in handles if h != handle]
    if remaining:
        browser.switch_to_window(remaining[0])

def backN(browser):
    """Close all windows other than the current one, then refocus the current one.

    Args:
        browser: Selenium WebDriver instance controlling the session.

    Intended for the moment after focus has moved to a newly opened window:
    the older windows are closed and the new one stays focused.  The window
    to keep is identified by its saved handle, so the result does not depend
    on where that window sits in ``browser.window_handles``.
    """
    # Handle of the window that currently has focus; this one is kept.
    handle = browser.current_window_handle
    # Snapshot of all open window handles.
    handles = browser.window_handles
    for newhandle in handles:
        # Close every window that is not the one being kept.
        if newhandle != handle:
            browser.switch_to_window(newhandle)
            browser.close()
    # Refocus the kept window by handle instead of by a fixed list index
    # (handles[1]), which could point at a window that was just closed.
    browser.switch_to_window(handle)

# Close the windows other than the current one before running the search.
close(browser)
# Clear the search box again and re-type the keyword (u'恐怖' means "horror").
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').clear()
browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[1]/p[1]/input').send_keys(u'恐怖')
# Second input of the same form (form/div[2]/input); presumably the submit
# button -- confirm against the page markup.
ele = browser.find_element_by_xpath('//*[@id="header"]/div/div[3]/div[2]/div[2]/div[1]/div/div[1]/form/div[2]/input')
## Original author's note: clicking this element directly does not work,
# so the form is submitted by sending the ENTER key to it instead.
ele.send_keys(Keys.ENTER)
## Original author's note: the browser has now navigated to the results page.
# Parse the current page source; the parsed tree is handed to fun() later.
obj = BeautifulSoup(browser.page_source, 'html.parser')

def _sanitize_title(title):
    """Replace characters that Windows forbids in file names with '-'."""
    # Raw string with an explicit backslash so '\' is replaced as well as '/'.
    return re.sub(r'[\\/:*?"<>|]', '-', title)


def _find_download_link(browser):
    """Return the href of the download link on the focused detail page, or None.

    Two XPath locations are tried in order.  The messages printed when the
    first attempt fails and when every attempt fails are kept from the
    original script.
    """
    from selenium.common.exceptions import WebDriverException

    xpaths = (
        '//*[@id="Zoom"]/span/table/tbody/tr/td/a',
        '//*[@id="Zoom"]/span/div[5]/table/tbody/tr/td/a',
    )
    for attempt, xpath in enumerate(xpaths):
        try:
            link = browser.find_element_by_xpath(xpath)
            # Right-click on the link, as the original script did before
            # reading its href.
            ActionChains(browser).context_click(link).perform()
            href = link.get_attribute('href')
        except WebDriverException:
            # Element missing or not interactable at this location.
            href = None
        if href:
            return href
        if attempt == 0:
            print('WE can try another way!')
    print('This is a game!')
    return None


def fun(obj, num):
    """Crawl the search results page by page and save each entry's download link.

    For each of the first ten ``<table>`` elements inside the
    ``div.co_content8`` container of ``obj``, a folder named after the
    entry's (sanitized) title is created under ``E:/test/dytt/bt/``, the
    entry's detail page is opened in a new window, and the download link
    found there is written to ``bt.txt`` inside that folder.  Afterwards the
    pager link is clicked and the next page is processed the same way.

    Uses the module-level ``browser`` and the helpers ``change``, ``back``
    and ``backN`` defined in this file.

    Args:
        obj: BeautifulSoup tree of the current results page.
        num: 0 for the first results page (pager link in cell 9); any other
            value selects pager cell 10, which is used for all later pages.

    Returns:
        None.  The crawl stops when the pager link cannot be found.
    """
    import io
    from selenium.common.exceptions import NoSuchElementException

    base_dir = 'E:/test/dytt/bt/'
    pager_xpath = ('//*[@id="header"]/div/div[3]/div[3]/div[2]/div[2]/div[2]'
                   '/ul/table[11]/tbody/tr/td[%d]/a')

    # Pages are walked in a loop rather than by self-recursion, so a long
    # result list cannot exhaust the interpreter's recursion limit.
    while True:
        container = obj.find('div', {'class': 'co_content8'})
        tables = container.find_all('table') if container is not None else []

        # Only the first ten tables are treated as result entries.
        for table in tables[:10]:
            anchor = table.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # Nothing to open for this table.
                continue

            title = _sanitize_title(anchor.text)
            folder = base_dir + title
            if os.path.exists(folder):
                print('This folder already exists!')
            else:
                # makedirs also creates missing parent directories.
                os.makedirs(folder)
            print(title)

            # Open the detail page in a new window.  The URL is passed as a
            # script argument instead of being spliced into the JavaScript
            # source, so quotes in the href cannot break the script.
            browser.execute_script('window.open(arguments[0])',
                                   'http://www.ygdy8.com' + href)
            # Focus the newly opened detail window.
            change(browser)

            link = _find_download_link(browser)
            if link is not None:
                print(link)
                if not isinstance(link, type(u'')):
                    link = link.decode('utf-8')
                try:
                    # Explicit UTF-8 text file: a link with non-ASCII
                    # characters is written as-is, and the handle is closed
                    # even if the write fails.
                    with io.open(os.path.join(folder, 'bt.txt'), 'w',
                                 encoding='utf-8') as out:
                        out.write(link)
                except (IOError, OSError) as err:
                    # Keep crawling; one unwritable entry should not end the run.
                    print('Failed to write bt.txt: %r' % (err,))

            # Close the detail window and return to the results window.
            back(browser)

        # After the entries: follow the pager link to the next page.
        pager_cell = 9 if num == 0 else 10
        try:
            next_link = browser.find_element_by_xpath(pager_xpath % pager_cell)
        except NoSuchElementException:
            # No pager link at the expected position: treat as the end.
            return
        next_link.click()
        # If the click opened a new window, focus it and close the old one.
        change(browser)
        backN(browser)
        obj = BeautifulSoup(browser.page_source, 'html.parser')
        num = 1

def get_html(url):
    '''获取html'''
    ##定义headers
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers={"User-Agent":user_agent}
    request = urllib2.Request(url, headers=headers)
    #request.encoding = 'utf-8'
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print url+'Download error:', e.reason
        html = None
    return html

# Entry point: start crawling from the results page parsed above.
# num=0 tells fun() that this is the first results page.
fun(obj, 0)