qs = 'xxxxx'
print(parse.parse_qs(qs))
headers = {
}
opener = request.build_opener(handler)
resp = opener.open(req)
print(resp.read())
from urllib import request

    'User-Agent': "xxxx",
    'cookie': 'xxxx'

print(resp.read().decode('utf-8'))

with open('xxx.html', 'w', encoding='utf-8') as fp:
    fp.write(resp.read().decode('utf-8'))
from urllib import request, parse

    'User-Agent': 'xxxxx'
from http.cookiejar import CookieJar

def get_opener():
    cookiejar = CookieJar()
    handler = request.HTTPCookieProcessor(cookiejar)
    opener = request.build_opener(handler)
    return opener

def login_the_url(opener):
    data = {"name": "xxxxx", "password": "xxxxxx"}
    data = parse.urlencode(data).encode('utf-8')
    req = request.Request(login_url, headers=headers, data=data)
    opener.open(req)

def visit_profile(opener):
    req = request.Request(url, headers=headers)
    resp = opener.open(req)
    with open('xxx.html', 'w', encoding='utf-8') as fp:
        fp.write(resp.read().decode("utf-8"))

if __name__ == '__main__':
    opener = get_opener()
    login_the_url(opener)
    visit_profile(opener)
import requests

kw = {"wd": "xxx"}
headers = {"User-Agent": "xxx"}
response = requests.get(url, headers=headers, params=kw)
print(response.text)
print(response.content)
data = {
    'first': 'true',
    'pn': 1,
    'kd': 'python'
}
resp = requests.post(url, headers=headers, data=data)
import requests

proxy = {
}
data = {
    "name": "xxx", "password": "xxx"
}
headers = {
    'User-Agent': "xxx"
}
session = requests.Session()
session.post(url, data=data, headers=headers)
print(resp.content.decode('utf-8'))
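The snippet above relies on a requests session carrying cookies from the login POST into later requests; here is a minimal sketch of that flow with placeholder URLs and field names (login_url and profile_url are assumptions, not from the original):

import requests

login_url = "https://example.com/login"      # hypothetical login endpoint
profile_url = "https://example.com/profile"  # hypothetical page that needs the login cookie

headers = {'User-Agent': "xxx"}
data = {"name": "xxx", "password": "xxx"}

session = requests.Session()                 # the Session object stores cookies set by the server
session.post(login_url, data=data, headers=headers)
resp = session.get(profile_url, headers=headers)
print(resp.content.decode('utf-8'))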
//div[contains(@class,"f1")]
from lxml import etree

html = etree.HTML(text)
result = etree.tostring(html, encoding='utf-8')
result.decode('utf-8')

parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse("tencent.html", parser=parser)
trs=html.xpath("//tr")
fortrintrs:
#print(tr)
print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
print(trs)
print(etree.tostring(trs,encoding='utf-8').decode("utf-8"))
evens=html.xpath("//tr[@class='even']")
foreveninevens:
print(etree.tostring(even,encoding="utf-8").decode("utf-8"))
"""
all_things = []
for word in words:
    title = tr.xpath(".//a/text()")
    title = tr.xpath("./td/text()")
    title1 = tr.xpath("./td[1]//text()")
    all_thing = {
        "second": title2
    }
    break
'User-Agent':"Mozilla/5.0(WindowsNT10.0;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/79.0.3945.16Safari/537.36",
"=1&rsv_idx=2&ie=utf-8&tn=62095104_19_oem_dg&rsv_enter=1&rsv_dl=ib&rsv_sug3=8&rsv_sug1=5&rsv_sug7=100"
"&rsv_sug2=0&inputT=1250&rsv_sug4=1784"
response=requests.get(url,headers=headers)
# text = open("Douban.text", 'r', encoding="utf-8")
# print(response.text)
print(html)
ul = html.xpath("//ul")[0]
print(ul)
lts = ul.xpath("./li")
for li in lts:
    title = li.xpath("@data-title")
    data_release = li.xpath("@data-release")
    # data_duration = li.xpath("@data-ticketdata-duration")
    data_region = li.xpath("@data-region")
    data_actors = li.xpath("@data-actors")
    post = li.xpath(".//img/@src")
    print(data_actors)
    print(post)
    movie = {
        'title': title,
        'data_release': data_release
    }
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.16 Safari/537.36'
}

def get_detail_urls(url):
    response = requests.get(url, headers=HEADERS)
    text = response.content.decode(encoding='gbk', errors='ignore')
    # for detail_url in detail_urls:
    #     print(BASE_URL + detail_url)
    detail_urls = map(lambda url: BASE_URL + url, detail_urls)
    return detail_urls
# def abc(url):
#     return BASE_URL + url
# index = 0
#     detail_url = abc(detail_url)
#     detail_urls[index] = detail_url
#     index += 1
def spider():
    movies = []
    for x in range(1, 7):
        print("==================================")
        print(x)
        url = base_url.format(x)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # print(detail_url)
            movie = parse_detail_page(detail_url)
            movies.append(movie)
def parse_detail_page(url):
    movie = {}
    # titles = html.xpath("//font[@color='#07519a']")
    # print(titles)
    # for title in titles:
    #     print(etree.tostring(title, encoding='utf-8').decode('utf-8'))
    movie['title'] = title
    zoomE = html.xpath("//div[@id='Zoom']")[0]
    post_imgs = zoomE.xpath(".//img/@src")
    movie['post_imgs'] = post_imgs
    # print(post_imgs)
    infos = zoomE.xpath(".//text()")
    # print(infos)

    def parse_info(info, rule):
        return info.replace(rule, "").strip()

    # for info in infos:
    for index, info in enumerate(infos):
        # print(info)
        movie["year"] = info
        movie["country"] = info
        movie["category"] = info
        movie["douban_score"] = info
        movie["duration"] = info
        movie["director"] = info
        actors = [info]
        for x in range(index + 1, len(infos)):
            actor = infos[x].strip()
            actors.append(actor)
        movie['actors'] = actors
        profile = infos[x].strip()
        movie["profile"] = profile
    movie["download_url"] = download_url
    return movie
if __name__ == '__main__':
    spider()
from bs4 import BeautifulSoup

html = """
xxxxxx
"""
soup = BeautifulSoup(html, "lxml")
trs = soup.find_all('tr')
for tr in trs:
    print(tr)
    print(type(tr))

trs = soup.find_all('tr', limit=2)
for a in aList:
    print(a)
infos_ = []
info = {}
info['title'] = title
info['category'] = category
info['nums'] = nums
info['city'] = city
info['pubtime'] = pubtime
infos_.append(info)

# infos = tr.strings
# infos = list(tr.strings)
info['title'] = infos[0]
info['category'] = infos[1]
info['nums'] = infos[2]
info['city'] = infos[3]
info['pubtime'] = infos[4]
import html5lib
from pyecharts.charts import Bar

ALL_Data = []

def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.16 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    soup = BeautifulSoup(text, 'html5lib')
    # print(conMidTab)
    tables = conMidTab.find_all('table')
    for table in tables:
        trs = table.find_all('tr')[2:]
        if index == 0:
            ALL_Data.append({"city": city, "min_temp": int(min_temp)})
            # print({"city": city, "min_temp": min_temp})

def main():
    urls = ["hb", "db", "hd", "hn", "xb", "xn", "gat"]
    for id in urls:
        parse_page(url)
    # def sort_key(data):
    #     min_temp = data['min_temp']
    #     return min_temp
    # ALL_Data.sort(key=sort_key)
    data = ALL_Data[0:10]
    # for value in data:
    #     city = value['city']
    #     cities.append(city)
    cities_ = list(map(lambda x: x['city'], data))
    temps_ = list(map(lambda x: x['min_temp'], data))
    chart = Bar()
    chart.add_xaxis(cities_)
    chart.add_yaxis(series_name="the title", y_axis=temps_)

main()
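To actually write the bar chart out, pyecharts charts provide a render method that could be called at the end of main(); a one-line sketch (the output filename is an assumption):

chart.render("min_temp.html")   # writes a standalone HTML file containing the chart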
soup.select('.sister')
soup.select('#link')
.line1 {
    background-color: pink;
}

#line2 {
    background-color: rebeccapurple;
}

    background-color: azure;

    background-color: aqua;

input[name='username'] {
    background-color: coral;
}

the fourth data

5. soup + select
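A hedged sketch of how selectors like these are passed to BeautifulSoup's select (the HTML snippet below is made up purely for illustration):

from bs4 import BeautifulSoup

html = """
<p class="sister">one</p>
<p id="link">two</p>
<input name="username"/>
"""
soup = BeautifulSoup(html, "lxml")
print(soup.select('.sister'))                  # select by class
print(soup.select('#link'))                    # select by id
print(soup.select("input[name='username']"))   # select by attribute value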
text = 'hello'
>> he
text = "ab"
print(ret.group())
>> a
text = "123"
>> 1
text = "2a"
text = ""
>>
text = "_"
>> _
text = "+"
>> +
text = "0888-88888"
>> 0888-88888
ret = re.match('\d*', text)
>> 0888
text = "abcd"    # text = "+abcd"
ret = re.match('\w+', text)
>> abcd    # >> ab
ret = re.match('\w', text)
ret = re.match('\w{2}', text)
text = "hello"
>> h
text = "99"
text = "the macbook pro is $1999"
>> $299
text = "\\n"
>> \n
text = "apple's price $99, orange's price is $10"
ret = re.search('.*(\$\d+).*(\$\d+)', text)
findall
sub
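findall and sub are only named above; a minimal sketch of each, reusing the price example from earlier (the text and patterns are illustrative):

import re

text = "apple's price $99, orange's price is $10"
print(re.findall(r'\$\d+', text))    # ['$99', '$10'] -- every match, not just the first
print(re.sub(r'\$\d+', '0', text))   # "apple's price 0, orange's price is 0"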
text"helloworldnihao"ret=re.split('',text)print(ret)#['hello','world','ni','hao']
comlie:
text="thenumberis20.50"
r=re.compile('\d+\.\d*')
r=re.compile("""
""",re.VERBOSE)
import re, requests

    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.16 Safari/537.36'

text = response.text
contents = []
for content in content_tag:
    # print(content)
    contents.append(content.strip())

poems = []
for value in zip(titles, dynasties, authors, contents):
    title, dynasty, author, content = value
    more_poems = {
        'title': title,
        'dynasty': dynasty,
        'authors': author,
        'content': content
    }
    poems.append(more_poems)

for poem in poems:
    print(poem)

for page in range(10):
import json

persons = [
    {
        'username': "zhilioa",
        'age': 18,
        'country': "china"
    },
    {
        'username': "zhaxiaolie",
        'age': 20
    }
]
json_str = json.dumps(persons)
print(json_str)

with open('person.json', 'w', encoding='utf-8') as fp:
    fp.write(json_str)
class Person(object):
    country = 'china'

a = {
    'person': Person
}

persons = json.load(xxxx)
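json.load expects a file object rather than a string; a minimal sketch reading back the person.json written earlier (assuming that file exists):

import json

with open('person.json', 'r', encoding='utf-8') as fp:
    persons = json.load(fp)    # parses the JSON array back into a list of dicts
print(persons)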
import csv

def read_csv_demo1():
    with open('stock.csv', 'r') as fp:
        reader = csv.reader(fp)
        for x in reader:
            print(x)

def read_csv_demo2():
    with open('stock.csv', 'r') as fp:
        reader = csv.DictReader(fp)
        for x in reader:
            value = {"name": x['secShortname'], "volume": x['turnoverVol']}
            print(value)

if __name__ == '__main__':
    read_csv_demo2()
header = ['username', 'age', 'height']

def writer_csv_demo1():
    values = [('zhanghan', 12, 1800),
              ('lisi', 14, 111)]
    with open('class.csv', 'w', encoding="utf-8", newline='') as fp:
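        # The block above is cut off here; a hedged guess at how the write might
        # finish with the standard csv.writer API, using header and values from above:
        writer = csv.writer(fp)
        writer.writerow(header)     # header row first
        writer.writerows(values)    # then all the data rows at once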